import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm2
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from statsmodels.graphics.gofplots import qqplot_2samples, qqplot
import seaborn as sns
import statistics
%matplotlib inline
Section 1 – Objectives of Your Project
Section 3 – Correlation Analysis
Section 4 – Linear Regression Models
The project objective is to identify each team's style of play in different periods and how it affects the number of games won
Who needs the answers:
Fans of the game
Coaching staff and players to analyze competitors' strategy and make their own strategy
Sports writers that need material for their writings
How do we do it?
Confidence level: 95% (significance level alpha = 5%)
RMSE: because of the nature of the dependent variable (a count of games won), the RMSE does not need to be very small (e.g. less than 1). We suggest that an acceptable RMSE is within ±5 games won, or alternatively that the Scatter Index (a normalized measure of error) is less than 5%
# Load the raw team-season table; the first CSV row supplies the column names.
csv_path = 'baseball_teams.csv'
df = pd.read_csv(csv_path, header=0)
# Preview the first ten rows.
df.head(10)
| Year | League | Team | Franchise | Division | Final_Standing | Games_Played | Unnamed: 7 | Games_Won | Games_Lost | ... | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | Home_Ball_Park | Attendance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1871 | NaN | BS1 | BNA | NaN | 3 | 31 | NaN | 20 | 10 | ... | 367 | 2 | 42 | 23 | 225 | NaN | 0.83 | Boston Red Stockings | South End Grounds I | NaN |
| 1 | 1871 | NaN | CH1 | CNA | NaN | 2 | 28 | NaN | 19 | 9 | ... | 308 | 6 | 28 | 22 | 218 | NaN | 0.82 | Chicago White Stockings | Union Base-Ball Grounds | NaN |
| 2 | 1871 | NaN | CL1 | CFC | NaN | 8 | 29 | NaN | 10 | 19 | ... | 346 | 13 | 53 | 34 | 223 | NaN | 0.81 | Cleveland Forest Citys | National Association Grounds | NaN |
| 3 | 1871 | NaN | FW1 | KEK | NaN | 7 | 19 | NaN | 7 | 12 | ... | 261 | 5 | 21 | 17 | 163 | NaN | 0.80 | Fort Wayne Kekiongas | Hamilton Field | NaN |
| 4 | 1871 | NaN | NY2 | NNA | NaN | 5 | 33 | NaN | 16 | 17 | ... | 373 | 7 | 42 | 22 | 227 | NaN | 0.83 | New York Mutuals | Union Grounds (Brooklyn) | NaN |
| 5 | 1871 | NaN | PH1 | PNA | NaN | 1 | 28 | NaN | 21 | 7 | ... | 329 | 3 | 53 | 16 | 194 | NaN | 0.84 | Philadelphia Athletics | Jefferson Street Grounds | NaN |
| 6 | 1871 | NaN | RC1 | ROK | NaN | 9 | 25 | NaN | 4 | 21 | ... | 315 | 3 | 34 | 16 | 220 | NaN | 0.82 | Rockford Forest Citys | Agricultural Society Fair Grounds | NaN |
| 7 | 1871 | NaN | TRO | TRO | NaN | 6 | 29 | NaN | 13 | 15 | ... | 431 | 4 | 75 | 12 | 198 | NaN | 0.84 | Troy Haymakers | Haymakers' Grounds | . |
| 8 | 1871 | NaN | WS3 | OLY | NaN | 4 | 32 | NaN | 15 | 15 | ... | 371 | 4 | 45 | 13 | 217 | NaN | 0.85 | Washington Olympics | Olympics Grounds | NaN |
| 9 | 1872 | NaN | BL1 | BLC | NaN | 2 | 58 | NaN | 35 | 19 | ... | 566 | 3 | 63 | 0 | 432 | NaN | 0.82 | Baltimore Canaries | Newington Park | NaN |
10 rows × 43 columns
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the raw data.
df.describe()
| Year | Final_Standing | Games_Played | Unnamed: 7 | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | Doubles | ... | Shutout | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2805.000000 | 2805.000000 | 2805.00000 | 2406.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | ... | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2488.000000 | 2805.000000 |
| mean | 1955.036720 | 4.107308 | 150.34795 | 78.465919 | 74.749020 | 74.749020 | 681.945811 | 5142.492335 | 1346.273440 | 227.624955 | ... | 9.664171 | 23.667736 | 4022.383244 | 1346.083779 | 101.136542 | 474.010695 | 731.229234 | 186.337255 | 140.186495 | 0.961519 |
| std | 41.519083 | 2.323414 | 23.22725 | 4.698684 | 17.640402 | 17.378079 | 135.738244 | 750.551691 | 219.891603 | 58.692602 | ... | 5.097328 | 16.281300 | 630.996942 | 219.521064 | 58.245002 | 131.890032 | 296.409881 | 107.657444 | 29.322764 | 0.030224 |
| min | 1871.000000 | 1.000000 | 6.00000 | 44.000000 | 0.000000 | 4.000000 | 24.000000 | 211.000000 | 33.000000 | 3.000000 | ... | 0.000000 | 0.000000 | 162.000000 | 49.000000 | 0.000000 | 0.000000 | 0.000000 | 47.000000 | 18.000000 | 0.760000 |
| 25% | 1919.000000 | 2.000000 | 153.00000 | 77.000000 | 66.000000 | 65.000000 | 613.000000 | 5127.000000 | 1299.000000 | 193.000000 | ... | 6.000000 | 9.000000 | 4077.000000 | 1288.000000 | 46.000000 | 427.000000 | 501.000000 | 116.000000 | 126.000000 | 0.960000 |
| 50% | 1963.000000 | 4.000000 | 157.00000 | 81.000000 | 77.000000 | 76.000000 | 690.000000 | 5389.000000 | 1393.000000 | 231.000000 | ... | 9.000000 | 24.000000 | 4236.000000 | 1392.000000 | 109.000000 | 494.000000 | 735.000000 | 145.000000 | 145.000000 | 0.970000 |
| 75% | 1992.000000 | 6.000000 | 162.00000 | 81.000000 | 87.000000 | 87.000000 | 763.000000 | 5517.000000 | 1467.000000 | 270.000000 | ... | 13.000000 | 38.000000 | 4341.000000 | 1470.000000 | 148.000000 | 555.000000 | 965.000000 | 217.000000 | 159.250000 | 0.980000 |
| max | 2015.000000 | 13.000000 | 165.00000 | 84.000000 | 116.000000 | 134.000000 | 1220.000000 | 5781.000000 | 1783.000000 | 376.000000 | ... | 32.000000 | 68.000000 | 4518.000000 | 1993.000000 | 241.000000 | 827.000000 | 1450.000000 | 639.000000 | 217.000000 | 0.991000 |
8 rows × 32 columns
# Column-by-column non-null counts and dtypes of the raw data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2805 entries, 0 to 2804 Data columns (total 43 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 2805 non-null int64 1 League 2755 non-null object 2 Team 2805 non-null object 3 Franchise 2805 non-null object 4 Division 1288 non-null object 5 Final_Standing 2805 non-null int64 6 Games_Played 2805 non-null int64 7 Unnamed: 7 2406 non-null float64 8 Games_Won 2805 non-null int64 9 Games_Lost 2805 non-null int64 10 Unnamed: 10 1260 non-null object 11 Unnamed: 11 624 non-null object 12 League_Win 2777 non-null object 13 World_Series 2448 non-null object 14 Runs_Scored 2805 non-null int64 15 At_Bats 2805 non-null int64 16 Hits 2805 non-null int64 17 Doubles 2805 non-null int64 18 Triples 2805 non-null int64 19 Home_Runs 2805 non-null int64 20 Walks 2805 non-null int64 21 Strike_Outs 2685 non-null float64 22 Stolen_Bases 2661 non-null float64 23 Caught_Stealing 1946 non-null float64 24 Hit_By_Pitch 480 non-null float64 25 Sacrifice_Fly 480 non-null float64 26 Runs_Against 2805 non-null int64 27 Earned_Runs 2805 non-null int64 28 Earned_Run_Average 2805 non-null float64 29 Complete_Games 2805 non-null int64 30 Shutout 2805 non-null int64 31 Saves 2805 non-null int64 32 Infield_Put_Outs 2805 non-null int64 33 Hits_Allowed 2805 non-null int64 34 Home_Run_Allowed 2805 non-null int64 35 Walks_Allowed 2805 non-null int64 36 Strikeouts_Allowed 2805 non-null int64 37 Errors 2805 non-null int64 38 Double_Plays 2488 non-null float64 39 Fielding_Percentage 2805 non-null float64 40 Team_Name 2805 non-null object 41 Home_Ball_Park 2771 non-null object 42 Attendance 2527 non-null object dtypes: float64(9), int64(23), object(11) memory usage: 942.4+ KB None
# List every column label of the raw data.
# NOTE(review): the output shows 'Franchise ' with a trailing space, so that
# column must be referenced with the space unless the labels are stripped.
df.columns
Index(['Year', 'League', 'Team', 'Franchise ', 'Division', 'Final_Standing',
'Games_Played', 'Unnamed: 7', 'Games_Won', 'Games_Lost', 'Unnamed: 10',
'Unnamed: 11', 'League_Win', 'World_Series', 'Runs_Scored', 'At_Bats',
'Hits', 'Doubles', 'Triples', 'Home_Runs', 'Walks', 'Strike_Outs',
'Stolen_Bases', 'Caught_Stealing', 'Hit_By_Pitch', 'Sacrifice_Fly',
'Runs_Against', 'Earned_Runs', 'Earned_Run_Average', 'Complete_Games',
'Shutout', 'Saves', 'Infield_Put_Outs', 'Hits_Allowed',
'Home_Run_Allowed', 'Walks_Allowed', 'Strikeouts_Allowed', 'Errors',
'Double_Plays', 'Fielding_Percentage', 'Team_Name', 'Home_Ball_Park',
'Attendance'],
dtype='object')
From the information above, there are 2805 rows in total, so a complete column has 2805 non-null entries. Only the columns below have all 2805 entries, so we'll keep them in the dataset
Year, Team, Franchise, Final_Standing, Games_Played, Games_Won, Games_Lost, Runs_Scored, At_Bats, Hits, Doubles, Triples, Home_Runs, Walks, Runs_Against, Earned_Runs, Earned_Run_Average, Complete_Games, Shutout, Saves, Infield_Put_Outs, Hits_Allowed, Home_Run_Allowed, Walks_Allowed, Strikeouts_Allowed, Errors, Fielding_Percentage, Team_Name
The rest of the columns have some missing data, so we now decide how to handle each of them
First, there are 3 unnamed columns (Unnamed: 7, Unnamed: 10, Unnamed: 11). Since the data itself (binary variables and discrete variables) cannot help us identify the columns' names, we remove all 3 unnamed columns
Next, we have some missing data for the variables that describe style of play. The columns that have missing data are: Strike_Outs, Stolen_Bases, Caught_Stealing, Hit_By_Pitch, Sacrifice_Fly (Offensive Style) and Double_Plays (Defensive Style)
Hit_By_Pitch and Sacrifice_Fly only have 480 out of 2805 rows. Since too many rows are missing, we will remove these 2 variables/columns from the dataset
Caught_Stealing only has 1946 out of 2805 entries. We will remove this column from the dataset
Strike_Outs has full data from 1913 to 2015, so we can use this for period 3, 4 and the prediction
Stolen_Bases has full data from 1886, so we can use this for period 2, 3, 4 and the prediction
Double_Plays has full data from 1900, so we can use this for period 2, 3, 4 and the prediction
We also have other columns with missing data, but those columns are either nominal variables (qualitative data) or have no effect on Games_Won (the dependent variable). We can either replace the missing data with 0 or drop the columns; in this project, we decided to drop the columns. The columns to be removed are League, Division, League_Win, World_Series, Home_Ball_Park, Attendance
Now, we are dropping the columns that are not necessary and have missing data
# Columns excluded from the analysis: sparsely populated statistics, the
# unnamed fields, and qualitative fields with missing values.
columns_to_drop = [
    'Sacrifice_Fly', 'Hit_By_Pitch',
    'Unnamed: 7', 'Unnamed: 10', 'Unnamed: 11',
    'League', 'Division', 'League_Win', 'World_Series',
    'Home_Ball_Park', 'Attendance', 'Caught_Stealing',
]
# drop() returns a new frame by default, so the raw `df` is left untouched.
data_baseball = df.drop(columns=columns_to_drop)
data_baseball.head()
| Year | Team | Franchise | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1871 | BS1 | BNA | 3 | 31 | 20 | 10 | 401 | 1372 | 426 | ... | 3 | 828 | 367 | 2 | 42 | 23 | 225 | NaN | 0.83 | Boston Red Stockings |
| 1 | 1871 | CH1 | CNA | 2 | 28 | 19 | 9 | 302 | 1196 | 323 | ... | 1 | 753 | 308 | 6 | 28 | 22 | 218 | NaN | 0.82 | Chicago White Stockings |
| 2 | 1871 | CL1 | CFC | 8 | 29 | 10 | 19 | 249 | 1186 | 328 | ... | 0 | 762 | 346 | 13 | 53 | 34 | 223 | NaN | 0.81 | Cleveland Forest Citys |
| 3 | 1871 | FW1 | KEK | 7 | 19 | 7 | 12 | 137 | 746 | 178 | ... | 0 | 507 | 261 | 5 | 21 | 17 | 163 | NaN | 0.80 | Fort Wayne Kekiongas |
| 4 | 1871 | NY2 | NNA | 5 | 33 | 16 | 17 | 302 | 1404 | 403 | ... | 0 | 879 | 373 | 7 | 42 | 22 | 227 | NaN | 0.83 | New York Mutuals |
5 rows × 31 columns
# Non-null counts and dtypes of the reduced dataset, to confirm which columns
# still contain missing values after the drop.
data_baseball.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2805 entries, 0 to 2804 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 2805 non-null int64 1 Team 2805 non-null object 2 Franchise 2805 non-null object 3 Final_Standing 2805 non-null int64 4 Games_Played 2805 non-null int64 5 Games_Won 2805 non-null int64 6 Games_Lost 2805 non-null int64 7 Runs_Scored 2805 non-null int64 8 At_Bats 2805 non-null int64 9 Hits 2805 non-null int64 10 Doubles 2805 non-null int64 11 Triples 2805 non-null int64 12 Home_Runs 2805 non-null int64 13 Walks 2805 non-null int64 14 Strike_Outs 2685 non-null float64 15 Stolen_Bases 2661 non-null float64 16 Runs_Against 2805 non-null int64 17 Earned_Runs 2805 non-null int64 18 Earned_Run_Average 2805 non-null float64 19 Complete_Games 2805 non-null int64 20 Shutout 2805 non-null int64 21 Saves 2805 non-null int64 22 Infield_Put_Outs 2805 non-null int64 23 Hits_Allowed 2805 non-null int64 24 Home_Run_Allowed 2805 non-null int64 25 Walks_Allowed 2805 non-null int64 26 Strikeouts_Allowed 2805 non-null int64 27 Errors 2805 non-null int64 28 Double_Plays 2488 non-null float64 29 Fielding_Percentage 2805 non-null float64 30 Team_Name 2805 non-null object dtypes: float64(5), int64(23), object(3) memory usage: 679.5+ KB
We now have the reduced dataset (Strike_Outs, Stolen_Bases and Double_Plays still contain missing values); we can explore it and determine the data types it provides
| Column Name | Data Type | Category |
|---|---|---|
| Year | Nominal | Descriptive Fields |
| Team | Nominal | Descriptive Fields |
| Franchise | Nominal | Descriptive Fields |
| Final_Standing | Discrete | Team Performance |
| Games_Played | Discrete | Team Performance |
| Games_Won | Discrete | Team Performance |
| Games_Lost | Discrete | Team Performance |
| Runs_Scored | Discrete | Team Performance |
| At_Bats | Discrete | Offensive Measurements |
| Hits | Discrete | Offensive Measurements |
| Doubles | Discrete | Offensive Measurements |
| Triples | Discrete | Offensive Measurements |
| Home_Runs | Discrete | Offensive Measurements |
| Walks | Discrete | Offensive Measurements |
| Strike_Outs | Discrete | Offensive Measurements |
| Stolen_Bases | Discrete | Offensive Measurements |
| Caught_Stealing | Discrete | removed |
| Hit_By_Pitch | Discrete | removed |
| Sacrifice_Fly | Discrete | removed |
| Runs_Against | Discrete | Defensive Measurements |
| Earned_Runs | Discrete | Defensive Measurements |
| Earned_Run_Average | Continuous | Defensive Measurements |
| Complete_Games | Discrete | Defensive Measurements |
| Shutout | Discrete | Defensive Measurements |
| Saves | Discrete | Defensive Measurements |
| Infield_Put_Outs | Discrete | Defensive Measurements |
| Hits_Allowed | Discrete | Defensive Measurements |
| Walks_Allowed | Discrete | Defensive Measurements |
| Strikeouts_Allowed | Discrete | Defensive Measurements |
| Errors | Discrete | Defensive Measurements |
| Double_Plays | Discrete | Defensive Measurements |
| Fielding_Percentage | Continuous | Defensive Measurements |
| Team_Name | Nominal | Miscellaneous |
As mentioned in the Objective, we are going to determine the style of play of each team during the years and how the style of play affects their chance to win.
The dependent variable is Games_Won
The potential independent variables that will be selected to build the model are all the columns that have either Offensive Measurements/Defensive Measurements in the Category in the table above
# Summary statistics for the numeric columns of the reduced dataset.
data_baseball.describe()
| Year | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | Doubles | Triples | ... | Shutout | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2805.000000 | 2805.000000 | 2805.00000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.00000 | ... | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2805.000000 | 2488.000000 | 2805.000000 |
| mean | 1955.036720 | 4.107308 | 150.34795 | 74.749020 | 74.749020 | 681.945811 | 5142.492335 | 1346.273440 | 227.624955 | 47.10410 | ... | 9.664171 | 23.667736 | 4022.383244 | 1346.083779 | 101.136542 | 474.010695 | 731.229234 | 186.337255 | 140.186495 | 0.961519 |
| std | 41.519083 | 2.323414 | 23.22725 | 17.640402 | 17.378079 | 135.738244 | 750.551691 | 219.891603 | 58.692602 | 22.26862 | ... | 5.097328 | 16.281300 | 630.996942 | 219.521064 | 58.245002 | 131.890032 | 296.409881 | 107.657444 | 29.322764 | 0.030224 |
| min | 1871.000000 | 1.000000 | 6.00000 | 0.000000 | 4.000000 | 24.000000 | 211.000000 | 33.000000 | 3.000000 | 0.00000 | ... | 0.000000 | 0.000000 | 162.000000 | 49.000000 | 0.000000 | 0.000000 | 0.000000 | 47.000000 | 18.000000 | 0.760000 |
| 25% | 1919.000000 | 2.000000 | 153.00000 | 66.000000 | 65.000000 | 613.000000 | 5127.000000 | 1299.000000 | 193.000000 | 31.00000 | ... | 6.000000 | 9.000000 | 4077.000000 | 1288.000000 | 46.000000 | 427.000000 | 501.000000 | 116.000000 | 126.000000 | 0.960000 |
| 50% | 1963.000000 | 4.000000 | 157.00000 | 77.000000 | 76.000000 | 690.000000 | 5389.000000 | 1393.000000 | 231.000000 | 41.00000 | ... | 9.000000 | 24.000000 | 4236.000000 | 1392.000000 | 109.000000 | 494.000000 | 735.000000 | 145.000000 | 145.000000 | 0.970000 |
| 75% | 1992.000000 | 6.000000 | 162.00000 | 87.000000 | 87.000000 | 763.000000 | 5517.000000 | 1467.000000 | 270.000000 | 60.00000 | ... | 13.000000 | 38.000000 | 4341.000000 | 1470.000000 | 148.000000 | 555.000000 | 965.000000 | 217.000000 | 159.250000 | 0.980000 |
| max | 2015.000000 | 13.000000 | 165.00000 | 116.000000 | 134.000000 | 1220.000000 | 5781.000000 | 1783.000000 | 376.000000 | 150.00000 | ... | 32.000000 | 68.000000 | 4518.000000 | 1993.000000 | 241.000000 | 827.000000 | 1450.000000 | 639.000000 | 217.000000 | 0.991000 |
8 rows × 28 columns
# Column labels remaining after the drop; used to pick the offensive and
# defensive variable lists below.
data_baseball.columns
Index(['Year', 'Team', 'Franchise ', 'Final_Standing', 'Games_Played',
'Games_Won', 'Games_Lost', 'Runs_Scored', 'At_Bats', 'Hits', 'Doubles',
'Triples', 'Home_Runs', 'Walks', 'Strike_Outs', 'Stolen_Bases',
'Runs_Against', 'Earned_Runs', 'Earned_Run_Average', 'Complete_Games',
'Shutout', 'Saves', 'Infield_Put_Outs', 'Hits_Allowed',
'Home_Run_Allowed', 'Walks_Allowed', 'Strikeouts_Allowed', 'Errors',
'Double_Plays', 'Fielding_Percentage', 'Team_Name'],
dtype='object')
# Offensive statistics whose distributions are checked for normality.
offensive = [
    'Runs_Scored', 'At_Bats', 'Hits', 'Doubles',
    'Triples', 'Home_Runs', 'Walks', 'Strike_Outs', 'Stolen_Bases',
]

# One Q-Q plot per variable against the normal distribution.
for column in offensive:
    # Strike_Outs and Stolen_Bases contain missing values; NaN would make the
    # standardized reference line (line='s', built from the sample mean and
    # standard deviation) undefined and distort the sample quantiles, so only
    # the observed values are plotted.
    observed = data_baseball[column].dropna()
    qqplot(observed, line='s')
    plt.title(label=column)
None of them is normally distributed except Doubles and Runs_Scored. The data from these 2 variables is not precisely normal, but not too far off.
# Defensive statistics whose distributions are checked for normality.
defensive = [
    'Runs_Against',
    'Earned_Runs', 'Earned_Run_Average', 'Complete_Games', 'Shutout',
    'Saves', 'Infield_Put_Outs', 'Hits_Allowed', 'Home_Run_Allowed',
    'Walks_Allowed', 'Strikeouts_Allowed', 'Errors', 'Double_Plays',
    'Fielding_Percentage',
]

# One Q-Q plot per variable against the normal distribution.
for column in defensive:
    # Double_Plays contains missing values; NaN would make the standardized
    # reference line (line='s', built from the sample mean and standard
    # deviation) undefined and distort the sample quantiles, so only the
    # observed values are plotted.
    observed = data_baseball[column].dropna()
    qqplot(observed, line='s')
    plt.title(label=column)
None of them is normally distributed except Earned_Run_Average. The data from the Earned_Run_Average variable is not precisely normal, but not too far off.
Since most of data from the variables are not normally distributed, we will use Spearman method to calculate the correlation
# Pairwise Spearman (rank) correlation matrix over the numeric columns only.
data_baseball.corr(method='spearman',numeric_only=True)
| Year | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | Doubles | Triples | ... | Shutout | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | 1.000000 | -0.274699 | 0.781980 | 0.315051 | 0.340399 | 0.240707 | 0.766968 | 0.432009 | 0.678671 | -0.631983 | ... | 0.097021 | 0.892868 | 0.743554 | 0.422593 | 0.865201 | 0.458994 | 0.883236 | -0.888545 | 0.361047 | 0.916595 |
| Final_Standing | -0.274699 | 1.000000 | -0.213166 | -0.801612 | 0.640538 | -0.459954 | -0.252544 | -0.344243 | -0.334162 | 0.088824 | ... | -0.372299 | -0.390545 | -0.290483 | 0.186951 | -0.145467 | 0.107300 | -0.326219 | 0.341624 | -0.014340 | -0.338664 |
| Games_Played | 0.781980 | -0.213166 | 1.000000 | 0.426088 | 0.395069 | 0.240833 | 0.866048 | 0.472664 | 0.548309 | -0.434139 | ... | 0.258410 | 0.786178 | 0.931249 | 0.445805 | 0.720693 | 0.464576 | 0.816380 | -0.632648 | 0.354613 | 0.714013 |
| Games_Won | 0.315051 | -0.801612 | 0.426088 | 1.000000 | -0.476318 | 0.575551 | 0.450319 | 0.537469 | 0.434675 | 0.013223 | ... | 0.536561 | 0.491225 | 0.510420 | -0.010287 | 0.234248 | 0.051055 | 0.424356 | -0.356374 | 0.142152 | 0.408746 |
| Games_Lost | 0.340399 | 0.640538 | 0.395069 | -0.476318 | 1.000000 | -0.226362 | 0.327896 | 0.051015 | 0.167462 | -0.159084 | ... | -0.179071 | 0.181630 | 0.286110 | 0.588136 | 0.412657 | 0.513484 | 0.258669 | -0.141217 | 0.223401 | 0.232301 |
| Runs_Scored | 0.240707 | -0.459954 | 0.240833 | 0.575551 | -0.226362 | 1.000000 | 0.468626 | 0.757546 | 0.616222 | 0.132385 | ... | -0.069142 | 0.298107 | 0.219980 | 0.411270 | 0.375124 | 0.272330 | 0.226619 | -0.197410 | 0.352792 | 0.261518 |
| At_Bats | 0.766968 | -0.252544 | 0.866048 | 0.450319 | 0.327896 | 0.468626 | 1.000000 | 0.742428 | 0.732729 | -0.366264 | ... | 0.121713 | 0.755724 | 0.859493 | 0.603994 | 0.759212 | 0.495107 | 0.740341 | -0.631324 | 0.461246 | 0.717702 |
| Hits | 0.432009 | -0.344243 | 0.472664 | 0.537469 | 0.051015 | 0.757546 | 0.742428 | 1.000000 | 0.785740 | 0.022698 | ... | 0.029691 | 0.459474 | 0.456512 | 0.643577 | 0.486024 | 0.366194 | 0.342793 | -0.363469 | 0.405722 | 0.446763 |
| Doubles | 0.678671 | -0.334162 | 0.548309 | 0.434675 | 0.167462 | 0.616222 | 0.732729 | 0.785740 | 1.000000 | -0.220792 | ... | 0.004934 | 0.623705 | 0.513107 | 0.617070 | 0.650222 | 0.422942 | 0.560636 | -0.581621 | 0.366630 | 0.661537 |
| Triples | -0.631983 | 0.088824 | -0.434139 | 0.013223 | -0.159084 | 0.132385 | -0.366264 | 0.022698 | -0.220792 | 1.000000 | ... | 0.051694 | -0.551959 | -0.414802 | -0.068064 | -0.551999 | -0.198915 | -0.565666 | 0.635171 | -0.191162 | -0.559284 |
| Home_Runs | 0.823918 | -0.347434 | 0.693135 | 0.450463 | 0.179379 | 0.512871 | 0.757164 | 0.541482 | 0.660915 | -0.532944 | ... | 0.045617 | 0.803792 | 0.662230 | 0.439857 | 0.886188 | 0.499955 | 0.778469 | -0.764482 | 0.433687 | 0.792145 |
| Walks | 0.461719 | -0.305729 | 0.483255 | 0.490243 | 0.082485 | 0.539667 | 0.490052 | 0.447216 | 0.490117 | -0.199808 | ... | 0.118947 | 0.508623 | 0.487373 | 0.366321 | 0.525874 | 0.545336 | 0.447933 | -0.434163 | 0.378155 | 0.478526 |
| Strike_Outs | 0.895568 | -0.200366 | 0.813299 | 0.302005 | 0.402110 | 0.154026 | 0.719682 | 0.259131 | 0.530377 | -0.566149 | ... | 0.180225 | 0.825208 | 0.772673 | 0.312014 | 0.822780 | 0.431565 | 0.920380 | -0.745329 | 0.229025 | 0.797131 |
| Stolen_Bases | -0.192581 | -0.082728 | -0.063741 | 0.048425 | -0.137192 | 0.080899 | -0.176351 | -0.131850 | -0.175577 | 0.304038 | ... | 0.029598 | -0.143803 | -0.051383 | -0.205197 | -0.289459 | -0.221423 | -0.080675 | 0.296674 | -0.342434 | -0.286610 |
| Runs_Against | 0.253098 | 0.355105 | 0.223545 | -0.242123 | 0.600748 | 0.439777 | 0.376841 | 0.438892 | 0.442703 | 0.019831 | ... | -0.500155 | 0.139218 | 0.097777 | 0.794766 | 0.479119 | 0.551323 | 0.126411 | -0.066519 | 0.379589 | 0.161943 |
| Earned_Runs | 0.595115 | 0.168519 | 0.498714 | -0.046020 | 0.619575 | 0.431541 | 0.636378 | 0.578177 | 0.663727 | -0.257239 | ... | -0.342406 | 0.478491 | 0.385637 | 0.868230 | 0.762225 | 0.673869 | 0.429312 | -0.448664 | 0.484250 | 0.524027 |
| Earned_Run_Average | 0.387392 | 0.305546 | 0.189396 | -0.274268 | 0.500262 | 0.350832 | 0.356915 | 0.401457 | 0.485021 | -0.207851 | ... | -0.566322 | 0.243548 | 0.066119 | 0.732008 | 0.579502 | 0.521991 | 0.176583 | -0.300424 | 0.404286 | 0.312411 |
| Complete_Games | -0.907837 | 0.185047 | -0.661237 | -0.136375 | -0.286972 | -0.127545 | -0.658504 | -0.312849 | -0.578901 | 0.719088 | ... | 0.085239 | -0.842178 | -0.620513 | -0.353932 | -0.802188 | -0.406879 | -0.784269 | 0.838758 | -0.321861 | -0.810418 |
| Shutout | 0.097021 | -0.372299 | 0.258410 | 0.536561 | -0.179071 | -0.069142 | 0.121713 | 0.029691 | 0.004934 | 0.051694 | ... | 1.000000 | 0.180824 | 0.336574 | -0.271823 | -0.070023 | -0.093790 | 0.246854 | -0.136851 | -0.090311 | 0.182850 |
| Saves | 0.892868 | -0.390545 | 0.786178 | 0.491225 | 0.181630 | 0.298107 | 0.755724 | 0.459474 | 0.623705 | -0.551959 | ... | 0.180824 | 1.000000 | 0.787709 | 0.337349 | 0.785848 | 0.424078 | 0.844665 | -0.814831 | 0.349019 | 0.834160 |
| Infield_Put_Outs | 0.743554 | -0.290483 | 0.931249 | 0.510420 | 0.286110 | 0.219980 | 0.859493 | 0.456512 | 0.513107 | -0.414802 | ... | 0.336574 | 0.787709 | 1.000000 | 0.358175 | 0.650998 | 0.404665 | 0.802176 | -0.618994 | 0.325047 | 0.697420 |
| Hits_Allowed | 0.422593 | 0.186951 | 0.445805 | -0.010287 | 0.588136 | 0.411270 | 0.603994 | 0.643577 | 0.617070 | -0.068064 | ... | -0.271823 | 0.337349 | 0.358175 | 1.000000 | 0.550584 | 0.469565 | 0.238806 | -0.254858 | 0.454944 | 0.364834 |
| Home_Run_Allowed | 0.865201 | -0.145467 | 0.720693 | 0.234248 | 0.412657 | 0.375124 | 0.759212 | 0.486024 | 0.650222 | -0.551999 | ... | -0.070023 | 0.785848 | 0.650998 | 0.550584 | 1.000000 | 0.557030 | 0.781631 | -0.772648 | 0.449520 | 0.805396 |
| Walks_Allowed | 0.458994 | 0.107300 | 0.464576 | 0.051055 | 0.513484 | 0.272330 | 0.495107 | 0.366194 | 0.422942 | -0.198915 | ... | -0.093790 | 0.424078 | 0.404665 | 0.469565 | 0.557030 | 1.000000 | 0.420674 | -0.355897 | 0.489310 | 0.410029 |
| Strikeouts_Allowed | 0.883236 | -0.326219 | 0.816380 | 0.424356 | 0.258669 | 0.226619 | 0.740341 | 0.342793 | 0.560636 | -0.565666 | ... | 0.246854 | 0.844665 | 0.802176 | 0.238806 | 0.781631 | 0.420674 | 1.000000 | -0.769876 | 0.215895 | 0.811301 |
| Errors | -0.888545 | 0.341624 | -0.632648 | -0.356374 | -0.141217 | -0.197410 | -0.631324 | -0.363469 | -0.581621 | 0.635171 | ... | -0.136851 | -0.814831 | -0.618994 | -0.254858 | -0.772648 | -0.355897 | -0.769876 | 1.000000 | -0.346504 | -0.933260 |
| Double_Plays | 0.361047 | -0.014340 | 0.354613 | 0.142152 | 0.223401 | 0.352792 | 0.461246 | 0.405722 | 0.366630 | -0.191162 | ... | -0.090311 | 0.349019 | 0.325047 | 0.454944 | 0.449520 | 0.489310 | 0.215895 | -0.346504 | 1.000000 | 0.393810 |
| Fielding_Percentage | 0.916595 | -0.338664 | 0.714013 | 0.408746 | 0.232301 | 0.261518 | 0.717702 | 0.446763 | 0.661537 | -0.559284 | ... | 0.182850 | 0.834160 | 0.697420 | 0.364834 | 0.805396 | 0.410029 | 0.811301 | -0.933260 | 0.393810 | 1.000000 |
28 rows × 28 columns
# Rank every numeric column by its Spearman correlation with Games_Won,
# strongest positive correlation first.
spearman_matrix = data_baseball.corr(method='spearman', numeric_only=True)
games_won_corr = spearman_matrix['Games_Won']
print(games_won_corr.sort_values(ascending=False))
Games_Won 1.000000 Runs_Scored 0.575551 Hits 0.537469 Shutout 0.536561 Infield_Put_Outs 0.510420 Saves 0.491225 Walks 0.490243 Home_Runs 0.450463 At_Bats 0.450319 Doubles 0.434675 Games_Played 0.426088 Strikeouts_Allowed 0.424356 Fielding_Percentage 0.408746 Year 0.315051 Strike_Outs 0.302005 Home_Run_Allowed 0.234248 Double_Plays 0.142152 Walks_Allowed 0.051055 Stolen_Bases 0.048425 Triples 0.013223 Hits_Allowed -0.010287 Earned_Runs -0.046020 Complete_Games -0.136375 Runs_Against -0.242123 Earned_Run_Average -0.274268 Errors -0.356374 Games_Lost -0.476318 Final_Standing -0.801612 Name: Games_Won, dtype: float64
Some independent variables are strongly correlated with each other (correlation over 0.7)
Variables that are not strongly correlated with any other independent variable: Walks, Stolen_Bases, Shutout, Walks_Allowed, Double_Plays
Assuming we use the entire data set to build the model, the top 8 independent variables with the largest correlations with Games_Won are Runs_Scored, Hits, Shutout, Infield_Put_Outs, Saves, Walks, Home_Runs, At_Bats. We can first keep Shutout and Walks because these 2 variables are not strongly correlated with any other independent variable. Among Runs_Scored, Hits, Infield_Put_Outs, Saves, Home_Runs, and At_Bats, since these are highly correlated with each other, we can keep at most 2 of them
Because this is a general data analysis for the entire data set, we will not eliminate these variables from the dataset for now, but we will take this into consideration when choosing the top 8 variables to build the regression models. If there are independent variables that have a high correlation with others in the top 8 list, we can only keep one of them in the model
# Offensive statistics to plot against the dependent variable.
offensive = [
    'Runs_Scored', 'At_Bats', 'Hits', 'Doubles',
    'Triples', 'Home_Runs', 'Walks', 'Strike_Outs', 'Stolen_Bases',
]

# One scatter plot per variable, shown separately, to eyeball linearity
# against Games_Won.
for column in offensive:
    sns.scatterplot(data=data_baseball, x=column, y='Games_Won')
    plt.show()
Independent variables that don't have a linear relationship to the dependent variable are Stolen_Bases and Triples
# One scatter plot per defensive variable, shown separately, to eyeball
# linearity against Games_Won.
for column in defensive:
    sns.scatterplot(data=data_baseball, x=column, y='Games_Won')
    plt.show()
Independent variables that don't have a linear relationship to the dependent variable are Runs_Against, Home_Run_Allowed, Earned_Runs, Earned_Run_Average, Double_Plays, Complete_Games, Hits_Allowed
| Column Name | Data Type | Category | Normality | # of missing values | Action |
|---|---|---|---|---|---|
| Year | Nominal | Descriptive Fields | |||
| Team | Nominal | Descriptive Fields | |||
| Franchise | Nominal | Descriptive Fields | |||
| Final_Standing | Discrete | Team Performance | |||
| Games_Played | Discrete | Team Performance | |||
| Games_Won | Discrete | Team Performance | |||
| Games_Lost | Discrete | Team Performance | |||
| Runs_Scored | Discrete | Team Performance | Yes | ||
| At_Bats | Discrete | Offensive Measurements | |||
| Hits | Discrete | Offensive Measurements | |||
| Doubles | Discrete | Offensive Measurements | Yes | ||
| Triples | Discrete | Offensive Measurements | |||
| Home_Runs | Discrete | Offensive Measurements | |||
| Walks | Discrete | Offensive Measurements | |||
| Strike_Outs | Discrete | Offensive Measurements | | 120 | used for period 3, 4 |
| Stolen_Bases | Discrete | Offensive Measurements | | 144 | used for period 2, 3, 4 |
| Caught_Stealing | Discrete | Offensive Measurements | | 859 | to be removed |
| Hit_By_Pitch | Discrete | Offensive Measurements | | 2325 | to be removed |
| Sacrifice_Fly | Discrete | Offensive Measurements | | 2325 | to be removed |
| Runs_Against | Discrete | Defensive Measurements | |||
| Earned_Runs | Discrete | Defensive Measurements | |||
| Earned_Run_Average | Continuous | Defensive Measurements | Yes | ||
| Complete_Games | Discrete | Defensive Measurements | |||
| Shutout | Discrete | Defensive Measurements | |||
| Saves | Discrete | Defensive Measurements | |||
| Infield_Put_Outs | Discrete | Defensive Measurements | |||
| Hits_Allowed | Discrete | Defensive Measurements | |||
| Walks_Allowed | Discrete | Defensive Measurements | |||
| Strikeouts_Allowed | Discrete | Defensive Measurements | |||
| Errors | Discrete | Defensive Measurements | |||
| Double_Plays | Discrete | Defensive Measurements | | 317 | used for period 2, 3, 4 |
| Fielding_Percentage | Continuous | Defensive Measurements | |||
| Team_Name | Nominal | Miscellaneous |
# Spearman (rank) correlation matrix over the numeric columns, displayed again
# for reference (same call as the earlier correlation cell).
data_baseball.corr(method='spearman',numeric_only=True)
| Year | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | Doubles | Triples | ... | Shutout | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | 1.000000 | -0.274699 | 0.781980 | 0.315051 | 0.340399 | 0.240707 | 0.766968 | 0.432009 | 0.678671 | -0.631983 | ... | 0.097021 | 0.892868 | 0.743554 | 0.422593 | 0.865201 | 0.458994 | 0.883236 | -0.888545 | 0.361047 | 0.916595 |
| Final_Standing | -0.274699 | 1.000000 | -0.213166 | -0.801612 | 0.640538 | -0.459954 | -0.252544 | -0.344243 | -0.334162 | 0.088824 | ... | -0.372299 | -0.390545 | -0.290483 | 0.186951 | -0.145467 | 0.107300 | -0.326219 | 0.341624 | -0.014340 | -0.338664 |
| Games_Played | 0.781980 | -0.213166 | 1.000000 | 0.426088 | 0.395069 | 0.240833 | 0.866048 | 0.472664 | 0.548309 | -0.434139 | ... | 0.258410 | 0.786178 | 0.931249 | 0.445805 | 0.720693 | 0.464576 | 0.816380 | -0.632648 | 0.354613 | 0.714013 |
| Games_Won | 0.315051 | -0.801612 | 0.426088 | 1.000000 | -0.476318 | 0.575551 | 0.450319 | 0.537469 | 0.434675 | 0.013223 | ... | 0.536561 | 0.491225 | 0.510420 | -0.010287 | 0.234248 | 0.051055 | 0.424356 | -0.356374 | 0.142152 | 0.408746 |
| Games_Lost | 0.340399 | 0.640538 | 0.395069 | -0.476318 | 1.000000 | -0.226362 | 0.327896 | 0.051015 | 0.167462 | -0.159084 | ... | -0.179071 | 0.181630 | 0.286110 | 0.588136 | 0.412657 | 0.513484 | 0.258669 | -0.141217 | 0.223401 | 0.232301 |
| Runs_Scored | 0.240707 | -0.459954 | 0.240833 | 0.575551 | -0.226362 | 1.000000 | 0.468626 | 0.757546 | 0.616222 | 0.132385 | ... | -0.069142 | 0.298107 | 0.219980 | 0.411270 | 0.375124 | 0.272330 | 0.226619 | -0.197410 | 0.352792 | 0.261518 |
| At_Bats | 0.766968 | -0.252544 | 0.866048 | 0.450319 | 0.327896 | 0.468626 | 1.000000 | 0.742428 | 0.732729 | -0.366264 | ... | 0.121713 | 0.755724 | 0.859493 | 0.603994 | 0.759212 | 0.495107 | 0.740341 | -0.631324 | 0.461246 | 0.717702 |
| Hits | 0.432009 | -0.344243 | 0.472664 | 0.537469 | 0.051015 | 0.757546 | 0.742428 | 1.000000 | 0.785740 | 0.022698 | ... | 0.029691 | 0.459474 | 0.456512 | 0.643577 | 0.486024 | 0.366194 | 0.342793 | -0.363469 | 0.405722 | 0.446763 |
| Doubles | 0.678671 | -0.334162 | 0.548309 | 0.434675 | 0.167462 | 0.616222 | 0.732729 | 0.785740 | 1.000000 | -0.220792 | ... | 0.004934 | 0.623705 | 0.513107 | 0.617070 | 0.650222 | 0.422942 | 0.560636 | -0.581621 | 0.366630 | 0.661537 |
| Triples | -0.631983 | 0.088824 | -0.434139 | 0.013223 | -0.159084 | 0.132385 | -0.366264 | 0.022698 | -0.220792 | 1.000000 | ... | 0.051694 | -0.551959 | -0.414802 | -0.068064 | -0.551999 | -0.198915 | -0.565666 | 0.635171 | -0.191162 | -0.559284 |
| Home_Runs | 0.823918 | -0.347434 | 0.693135 | 0.450463 | 0.179379 | 0.512871 | 0.757164 | 0.541482 | 0.660915 | -0.532944 | ... | 0.045617 | 0.803792 | 0.662230 | 0.439857 | 0.886188 | 0.499955 | 0.778469 | -0.764482 | 0.433687 | 0.792145 |
| Walks | 0.461719 | -0.305729 | 0.483255 | 0.490243 | 0.082485 | 0.539667 | 0.490052 | 0.447216 | 0.490117 | -0.199808 | ... | 0.118947 | 0.508623 | 0.487373 | 0.366321 | 0.525874 | 0.545336 | 0.447933 | -0.434163 | 0.378155 | 0.478526 |
| Strike_Outs | 0.895568 | -0.200366 | 0.813299 | 0.302005 | 0.402110 | 0.154026 | 0.719682 | 0.259131 | 0.530377 | -0.566149 | ... | 0.180225 | 0.825208 | 0.772673 | 0.312014 | 0.822780 | 0.431565 | 0.920380 | -0.745329 | 0.229025 | 0.797131 |
| Stolen_Bases | -0.192581 | -0.082728 | -0.063741 | 0.048425 | -0.137192 | 0.080899 | -0.176351 | -0.131850 | -0.175577 | 0.304038 | ... | 0.029598 | -0.143803 | -0.051383 | -0.205197 | -0.289459 | -0.221423 | -0.080675 | 0.296674 | -0.342434 | -0.286610 |
| Runs_Against | 0.253098 | 0.355105 | 0.223545 | -0.242123 | 0.600748 | 0.439777 | 0.376841 | 0.438892 | 0.442703 | 0.019831 | ... | -0.500155 | 0.139218 | 0.097777 | 0.794766 | 0.479119 | 0.551323 | 0.126411 | -0.066519 | 0.379589 | 0.161943 |
| Earned_Runs | 0.595115 | 0.168519 | 0.498714 | -0.046020 | 0.619575 | 0.431541 | 0.636378 | 0.578177 | 0.663727 | -0.257239 | ... | -0.342406 | 0.478491 | 0.385637 | 0.868230 | 0.762225 | 0.673869 | 0.429312 | -0.448664 | 0.484250 | 0.524027 |
| Earned_Run_Average | 0.387392 | 0.305546 | 0.189396 | -0.274268 | 0.500262 | 0.350832 | 0.356915 | 0.401457 | 0.485021 | -0.207851 | ... | -0.566322 | 0.243548 | 0.066119 | 0.732008 | 0.579502 | 0.521991 | 0.176583 | -0.300424 | 0.404286 | 0.312411 |
| Complete_Games | -0.907837 | 0.185047 | -0.661237 | -0.136375 | -0.286972 | -0.127545 | -0.658504 | -0.312849 | -0.578901 | 0.719088 | ... | 0.085239 | -0.842178 | -0.620513 | -0.353932 | -0.802188 | -0.406879 | -0.784269 | 0.838758 | -0.321861 | -0.810418 |
| Shutout | 0.097021 | -0.372299 | 0.258410 | 0.536561 | -0.179071 | -0.069142 | 0.121713 | 0.029691 | 0.004934 | 0.051694 | ... | 1.000000 | 0.180824 | 0.336574 | -0.271823 | -0.070023 | -0.093790 | 0.246854 | -0.136851 | -0.090311 | 0.182850 |
| Saves | 0.892868 | -0.390545 | 0.786178 | 0.491225 | 0.181630 | 0.298107 | 0.755724 | 0.459474 | 0.623705 | -0.551959 | ... | 0.180824 | 1.000000 | 0.787709 | 0.337349 | 0.785848 | 0.424078 | 0.844665 | -0.814831 | 0.349019 | 0.834160 |
| Infield_Put_Outs | 0.743554 | -0.290483 | 0.931249 | 0.510420 | 0.286110 | 0.219980 | 0.859493 | 0.456512 | 0.513107 | -0.414802 | ... | 0.336574 | 0.787709 | 1.000000 | 0.358175 | 0.650998 | 0.404665 | 0.802176 | -0.618994 | 0.325047 | 0.697420 |
| Hits_Allowed | 0.422593 | 0.186951 | 0.445805 | -0.010287 | 0.588136 | 0.411270 | 0.603994 | 0.643577 | 0.617070 | -0.068064 | ... | -0.271823 | 0.337349 | 0.358175 | 1.000000 | 0.550584 | 0.469565 | 0.238806 | -0.254858 | 0.454944 | 0.364834 |
| Home_Run_Allowed | 0.865201 | -0.145467 | 0.720693 | 0.234248 | 0.412657 | 0.375124 | 0.759212 | 0.486024 | 0.650222 | -0.551999 | ... | -0.070023 | 0.785848 | 0.650998 | 0.550584 | 1.000000 | 0.557030 | 0.781631 | -0.772648 | 0.449520 | 0.805396 |
| Walks_Allowed | 0.458994 | 0.107300 | 0.464576 | 0.051055 | 0.513484 | 0.272330 | 0.495107 | 0.366194 | 0.422942 | -0.198915 | ... | -0.093790 | 0.424078 | 0.404665 | 0.469565 | 0.557030 | 1.000000 | 0.420674 | -0.355897 | 0.489310 | 0.410029 |
| Strikeouts_Allowed | 0.883236 | -0.326219 | 0.816380 | 0.424356 | 0.258669 | 0.226619 | 0.740341 | 0.342793 | 0.560636 | -0.565666 | ... | 0.246854 | 0.844665 | 0.802176 | 0.238806 | 0.781631 | 0.420674 | 1.000000 | -0.769876 | 0.215895 | 0.811301 |
| Errors | -0.888545 | 0.341624 | -0.632648 | -0.356374 | -0.141217 | -0.197410 | -0.631324 | -0.363469 | -0.581621 | 0.635171 | ... | -0.136851 | -0.814831 | -0.618994 | -0.254858 | -0.772648 | -0.355897 | -0.769876 | 1.000000 | -0.346504 | -0.933260 |
| Double_Plays | 0.361047 | -0.014340 | 0.354613 | 0.142152 | 0.223401 | 0.352792 | 0.461246 | 0.405722 | 0.366630 | -0.191162 | ... | -0.090311 | 0.349019 | 0.325047 | 0.454944 | 0.449520 | 0.489310 | 0.215895 | -0.346504 | 1.000000 | 0.393810 |
| Fielding_Percentage | 0.916595 | -0.338664 | 0.714013 | 0.408746 | 0.232301 | 0.261518 | 0.717702 | 0.446763 | 0.661537 | -0.559284 | ... | 0.182850 | 0.834160 | 0.697420 | 0.364834 | 0.805396 | 0.410029 | 0.811301 | -0.933260 | 0.393810 | 1.000000 |
28 rows × 28 columns
As mentioned in Section 2: Data Exploration, there is no data for Strike_Outs, Stolen_Bases, and Double_Plays for this period, so we remove them from the Period 1 correlation calculation used to determine the style of play.
# Period 1: every season before 1920.
period_1_rows = data_baseball['Year'] < 1920

# Columns left out of the Period 1 correlation analysis.
# NOTE(review): 'Franchise ' is spelled with a trailing space; presumably this
# matches the column name as loaded -- verify against the CSV header.
period_1_excluded = [
    'Franchise ',
    'Final_Standing',
    'Games_Played',
    'Games_Lost',
    'Strike_Outs',
    'Stolen_Bases',
    'Double_Plays',
]

data_baseball_1 = data_baseball.loc[period_1_rows].drop(columns=period_1_excluded)
data_baseball_1.head(5)
| Year | Team | Games_Won | Runs_Scored | At_Bats | Hits | Doubles | Triples | Home_Runs | Walks | ... | Shutout | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1871 | BS1 | 20 | 401 | 1372 | 426 | 70 | 37 | 3 | 60 | ... | 1 | 3 | 828 | 367 | 2 | 42 | 23 | 225 | 0.83 | Boston Red Stockings |
| 1 | 1871 | CH1 | 19 | 302 | 1196 | 323 | 52 | 21 | 10 | 60 | ... | 0 | 1 | 753 | 308 | 6 | 28 | 22 | 218 | 0.82 | Chicago White Stockings |
| 2 | 1871 | CL1 | 10 | 249 | 1186 | 328 | 35 | 40 | 7 | 26 | ... | 0 | 0 | 762 | 346 | 13 | 53 | 34 | 223 | 0.81 | Cleveland Forest Citys |
| 3 | 1871 | FW1 | 7 | 137 | 746 | 178 | 19 | 8 | 2 | 33 | ... | 1 | 0 | 507 | 261 | 5 | 21 | 17 | 163 | 0.80 | Fort Wayne Kekiongas |
| 4 | 1871 | NY2 | 16 | 302 | 1404 | 403 | 43 | 21 | 1 | 33 | ... | 1 | 0 | 879 | 373 | 7 | 42 | 22 | 227 | 0.83 | New York Mutuals |
5 rows × 24 columns
# Print the column dtypes and non-null counts of the Period 1 subset.
data_baseball_1.info()
<class 'pandas.core.frame.DataFrame'> Index: 703 entries, 0 to 702 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 703 non-null int64 1 Team 703 non-null object 2 Games_Won 703 non-null int64 3 Runs_Scored 703 non-null int64 4 At_Bats 703 non-null int64 5 Hits 703 non-null int64 6 Doubles 703 non-null int64 7 Triples 703 non-null int64 8 Home_Runs 703 non-null int64 9 Walks 703 non-null int64 10 Runs_Against 703 non-null int64 11 Earned_Runs 703 non-null int64 12 Earned_Run_Average 703 non-null float64 13 Complete_Games 703 non-null int64 14 Shutout 703 non-null int64 15 Saves 703 non-null int64 16 Infield_Put_Outs 703 non-null int64 17 Hits_Allowed 703 non-null int64 18 Home_Run_Allowed 703 non-null int64 19 Walks_Allowed 703 non-null int64 20 Strikeouts_Allowed 703 non-null int64 21 Errors 703 non-null int64 22 Fielding_Percentage 703 non-null float64 23 Team_Name 703 non-null object dtypes: float64(2), int64(20), object(2) memory usage: 137.3+ KB
# Spearman rank-correlation matrix over the numeric columns of the Period 1 subset.
data_baseball_1.corr(numeric_only=True, method="spearman")
| Year | Games_Won | Runs_Scored | At_Bats | Hits | Doubles | Triples | Home_Runs | Walks | Runs_Against | ... | Complete_Games | Shutout | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Fielding_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | 1.000000 | 0.553107 | 0.107374 | 0.725220 | 0.551325 | 0.503047 | 0.376502 | 0.290436 | 0.589186 | 0.151924 | ... | 0.071586 | 0.648692 | 0.760274 | 0.849696 | 0.557399 | 0.286971 | 0.589335 | 0.624425 | -0.612668 | 0.939038 |
| Games_Won | 0.553107 | 1.000000 | 0.609485 | 0.635757 | 0.700850 | 0.664230 | 0.567015 | 0.486563 | 0.621796 | 0.029506 | ... | 0.374354 | 0.691273 | 0.592741 | 0.671723 | 0.269425 | 0.233305 | 0.358365 | 0.616015 | -0.304828 | 0.661324 |
| Runs_Scored | 0.107374 | 0.609485 | 1.000000 | 0.399262 | 0.758286 | 0.672725 | 0.710060 | 0.723392 | 0.632252 | 0.587389 | ... | 0.550255 | 0.058647 | 0.212410 | 0.233997 | 0.490005 | 0.558994 | 0.466616 | 0.216825 | 0.300881 | 0.136364 |
| At_Bats | 0.725220 | 0.635757 | 0.399262 | 1.000000 | 0.762945 | 0.585992 | 0.534613 | 0.411831 | 0.654047 | 0.379451 | ... | 0.413958 | 0.552729 | 0.585054 | 0.922037 | 0.686776 | 0.396297 | 0.665916 | 0.666052 | -0.264247 | 0.716559 |
| Hits | 0.551325 | 0.700850 | 0.758286 | 0.762945 | 1.000000 | 0.790539 | 0.731218 | 0.645367 | 0.693361 | 0.531624 | ... | 0.437852 | 0.317448 | 0.505305 | 0.625373 | 0.726588 | 0.529074 | 0.620909 | 0.400532 | -0.124616 | 0.558742 |
| Doubles | 0.503047 | 0.664230 | 0.672725 | 0.585992 | 0.790539 | 1.000000 | 0.579910 | 0.629602 | 0.615677 | 0.410463 | ... | 0.327829 | 0.339055 | 0.485716 | 0.529576 | 0.566666 | 0.472718 | 0.509094 | 0.437182 | -0.123000 | 0.511970 |
| Triples | 0.376502 | 0.567015 | 0.710060 | 0.534613 | 0.731218 | 0.579910 | 1.000000 | 0.614107 | 0.564460 | 0.495821 | ... | 0.425980 | 0.207482 | 0.355576 | 0.435812 | 0.538966 | 0.462890 | 0.519937 | 0.333154 | 0.042551 | 0.373548 |
| Home_Runs | 0.290436 | 0.486563 | 0.723392 | 0.411831 | 0.645367 | 0.629602 | 0.614107 | 1.000000 | 0.543065 | 0.545591 | ... | 0.409652 | 0.128568 | 0.279401 | 0.309350 | 0.489662 | 0.736548 | 0.500267 | 0.308907 | 0.141524 | 0.272651 |
| Walks | 0.589186 | 0.621796 | 0.632252 | 0.654047 | 0.693361 | 0.615677 | 0.564460 | 0.543065 | 1.000000 | 0.512543 | ... | 0.330158 | 0.332029 | 0.571187 | 0.640106 | 0.628274 | 0.530233 | 0.759536 | 0.475956 | -0.122523 | 0.561203 |
| Runs_Against | 0.151924 | 0.029506 | 0.587389 | 0.379451 | 0.531624 | 0.410463 | 0.495821 | 0.545591 | 0.512543 | 1.000000 | ... | 0.537418 | -0.247444 | 0.038756 | 0.200833 | 0.779524 | 0.726317 | 0.658181 | 0.092933 | 0.464460 | 0.036327 |
| Earned_Runs | 0.393301 | 0.148052 | 0.555897 | 0.522213 | 0.645120 | 0.512410 | 0.540154 | 0.565534 | 0.613799 | 0.925430 | ... | 0.457378 | -0.092804 | 0.236398 | 0.380888 | 0.898591 | 0.740567 | 0.756082 | 0.170635 | 0.182002 | 0.293383 |
| Earned_Run_Average | -0.174014 | -0.378637 | 0.251076 | -0.087315 | 0.160704 | 0.071842 | 0.159267 | 0.248767 | 0.137329 | 0.726633 | ... | 0.140010 | -0.633984 | -0.241579 | -0.259810 | 0.451578 | 0.472831 | 0.300310 | -0.348346 | 0.340818 | -0.292012 |
| Complete_Games | 0.071586 | 0.374354 | 0.550255 | 0.413958 | 0.437852 | 0.327829 | 0.425980 | 0.409652 | 0.330158 | 0.537418 | ... | 1.000000 | 0.171669 | -0.114333 | 0.291003 | 0.427003 | 0.419993 | 0.312632 | 0.291597 | 0.337331 | 0.136295 |
| Shutout | 0.648692 | 0.691273 | 0.058647 | 0.552729 | 0.317448 | 0.339055 | 0.207482 | 0.128568 | 0.332029 | -0.247444 | ... | 0.171669 | 1.000000 | 0.564568 | 0.707844 | 0.092116 | -0.019406 | 0.214127 | 0.675962 | -0.489450 | 0.736280 |
| Saves | 0.760274 | 0.592741 | 0.212410 | 0.585054 | 0.505305 | 0.485716 | 0.355576 | 0.279401 | 0.571187 | 0.038756 | ... | -0.114333 | 0.564568 | 1.000000 | 0.702414 | 0.372415 | 0.204592 | 0.496895 | 0.540862 | -0.500914 | 0.745408 |
| Infield_Put_Outs | 0.849696 | 0.671723 | 0.233997 | 0.922037 | 0.625373 | 0.529576 | 0.435812 | 0.309350 | 0.640106 | 0.200833 | ... | 0.291003 | 0.707844 | 0.702414 | 1.000000 | 0.573955 | 0.293877 | 0.614096 | 0.766730 | -0.436475 | 0.849280 |
| Hits_Allowed | 0.557399 | 0.269425 | 0.490005 | 0.686776 | 0.726588 | 0.566666 | 0.538966 | 0.489662 | 0.628274 | 0.779524 | ... | 0.427003 | 0.092116 | 0.372415 | 0.573955 | 1.000000 | 0.629721 | 0.701395 | 0.267022 | -0.006595 | 0.467880 |
| Home_Run_Allowed | 0.286971 | 0.233305 | 0.558994 | 0.396297 | 0.529074 | 0.472718 | 0.462890 | 0.736548 | 0.530233 | 0.726317 | ... | 0.419993 | -0.019406 | 0.204592 | 0.293877 | 0.629721 | 1.000000 | 0.551652 | 0.234406 | 0.207336 | 0.228728 |
| Walks_Allowed | 0.589335 | 0.358365 | 0.466616 | 0.665916 | 0.620909 | 0.509094 | 0.519937 | 0.500267 | 0.759536 | 0.658181 | ... | 0.312632 | 0.214127 | 0.496895 | 0.614096 | 0.701395 | 0.551652 | 1.000000 | 0.446855 | -0.051443 | 0.506870 |
| Strikeouts_Allowed | 0.624425 | 0.616015 | 0.216825 | 0.666052 | 0.400532 | 0.437182 | 0.333154 | 0.308907 | 0.475956 | 0.092933 | ... | 0.291597 | 0.675962 | 0.540862 | 0.766730 | 0.267022 | 0.234406 | 0.446855 | 1.000000 | -0.241414 | 0.629396 |
| Errors | -0.612668 | -0.304828 | 0.300881 | -0.264247 | -0.124616 | -0.123000 | 0.042551 | 0.141524 | -0.122523 | 0.464460 | ... | 0.337331 | -0.489450 | -0.500914 | -0.436475 | -0.006595 | 0.207336 | -0.051443 | -0.241414 | 1.000000 | -0.697764 |
| Fielding_Percentage | 0.939038 | 0.661324 | 0.136364 | 0.716559 | 0.558742 | 0.511970 | 0.373548 | 0.272651 | 0.561203 | 0.036327 | ... | 0.136295 | 0.736280 | 0.745408 | 0.849280 | 0.467880 | 0.228728 | 0.506870 | 0.629396 | -0.697764 | 1.000000 |
22 rows × 22 columns
# Keep the Period 1 Spearman matrix, then list every variable's correlation
# with Games_Won from highest to lowest.
df1_corr = data_baseball_1.corr(numeric_only=True, method="spearman")
games_won_ranking_1 = df1_corr.loc[:, 'Games_Won'].sort_values(ascending=False)
print(games_won_ranking_1)
Games_Won 1.000000 Hits 0.700850 Shutout 0.691273 Infield_Put_Outs 0.671723 Doubles 0.664230 Fielding_Percentage 0.661324 At_Bats 0.635757 Walks 0.621796 Strikeouts_Allowed 0.616015 Runs_Scored 0.609485 Saves 0.592741 Triples 0.567015 Year 0.553107 Home_Runs 0.486563 Complete_Games 0.374354 Walks_Allowed 0.358365 Hits_Allowed 0.269425 Home_Run_Allowed 0.233305 Earned_Runs 0.148052 Runs_Against 0.029506 Errors -0.304828 Earned_Run_Average -0.378637 Name: Games_Won, dtype: float64
The top 8 variables with the highest correlation with Games_Won are Hits (0.70), Shutout (0.69), Infield_Put_Outs (0.67), Fielding_Percentage (0.66), Doubles (0.66), At_Bats (0.64), Walks (0.62), and Strikeouts_Allowed (0.62). But after checking the correlation between these variables, we eliminate:
We then pick the next 4 variables that have high correlations with the dependent variable, which are Runs_Scored (0.61), Saves (0.59), Triples (0.57), and Home_Runs (0.49). After checking the correlation between these 4 variables and the first 4 variables, we eliminate Runs_Scored, as it has high correlations with Triples (0.71) and Home_Runs (0.72).
Finally, we pick Complete_Games (0.37) to replace Runs_Scored, and it does not have high correlation with any other variables in the candidate list
Now we have the top 8 variables that do not have strong correlation with each other
From the look of the histograms, Saves, Shutout and Home_Runs are right skewed, Complete_Games is left skewed, we will remove these variables from the top 8
The final variables that have the highest correlation:
Because there are 3 offensive variables and 1 defensive variable, the style of play for this period is Offensive.
# Offensive candidates for Period 1, each plotted against Games_Won in its
# own figure.
offensive = ['Doubles', 'Triples', 'Walks', 'Home_Runs']
for column in offensive:
    # The frame is passed as data= so the call does not depend on the order of
    # seaborn's positional parameters.
    sns.scatterplot(data=data_baseball_1, x=column, y='Games_Won')
    # Shown inside the loop so each variable gets a separate plot.
    plt.show()
The first 3 variables have a strong linear relationship with Games Won. Home_Runs does not seem to have a strong linear relationship with Games Won.
# Defensive candidates for Period 1, each plotted against Games_Won in its
# own figure.
defensive = ['Shutout', 'Saves', 'Strikeouts_Allowed', 'Complete_Games']
for column in defensive:
    # The frame is passed as data= so the call does not depend on the order of
    # seaborn's positional parameters.
    sns.scatterplot(data=data_baseball_1, x=column, y='Games_Won')
    # Shown inside the loop so each variable gets a separate plot.
    plt.show()
All 4 variables have linear relationships with Games Won
# Histograms (20 bins each) of the eight Period 1 candidate variables.
period_1_candidates = [
    'Doubles',
    'Triples',
    'Walks',
    'Home_Runs',
    'Shutout',
    'Saves',
    'Strikeouts_Allowed',
    'Complete_Games',
]
data_baseball_1[period_1_candidates].hist(figsize=(16, 10), bins=20)
plt.show()
# Period 2: seasons 1920 through 1959, both ends inclusive.
# The lower bound must be >= 1920: Period 1 is selected with Year < 1920, so a
# strict Year > 1920 here would leave the 1920 season out of every period.
df2 = data_baseball[(data_baseball['Year'] >= 1920) & (data_baseball['Year'] <= 1959)]
df2.head(5)
| Year | Team | Franchise | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 719 | 1921 | BOS | BOS | 5 | 154 | 75 | 79 | 668 | 5206 | 1440 | ... | 5 | 4092 | 1521 | 53 | 452 | 446 | 157 | 151.0 | 0.97 | Boston Red Sox |
| 720 | 1921 | BRO | LAD | 5 | 152 | 77 | 75 | 667 | 5263 | 1476 | ... | 12 | 4089 | 1556 | 46 | 361 | 471 | 232 | 142.0 | 0.96 | Brooklyn Robins |
| 721 | 1921 | BSN | ATL | 4 | 153 | 79 | 74 | 721 | 5385 | 1561 | ... | 12 | 4155 | 1488 | 54 | 420 | 382 | 199 | 122.0 | 0.96 | Boston Braves |
| 722 | 1921 | CHA | CHW | 7 | 154 | 62 | 92 | 683 | 5329 | 1509 | ... | 9 | 4095 | 1603 | 52 | 549 | 392 | 199 | 155.0 | 0.96 | Chicago White Sox |
| 723 | 1921 | CHN | CHC | 7 | 153 | 64 | 89 | 668 | 5321 | 1553 | ... | 7 | 4089 | 1605 | 67 | 409 | 441 | 166 | 129.0 | 0.97 | Chicago Cubs |
5 rows × 31 columns
We will remove Strike_Outs Column as it does not have enough data for this period
# Columns left out of the Period 2 correlation analysis.
# NOTE(review): 'Franchise ' is spelled with a trailing space; presumably this
# matches the column name as loaded -- verify against the CSV header.
period_2_excluded = [
    'Franchise ',
    'Final_Standing',
    'Games_Played',
    'Games_Lost',
    'Strike_Outs',
]
data_baseball_2 = df2.drop(columns=period_2_excluded)
data_baseball_2.head()
| Year | Team | Games_Won | Runs_Scored | At_Bats | Hits | Doubles | Triples | Home_Runs | Walks | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 719 | 1921 | BOS | 75 | 668 | 5206 | 1440 | 248 | 69 | 17 | 428 | ... | 5 | 4092 | 1521 | 53 | 452 | 446 | 157 | 151.0 | 0.97 | Boston Red Sox |
| 720 | 1921 | BRO | 77 | 667 | 5263 | 1476 | 209 | 85 | 59 | 325 | ... | 12 | 4089 | 1556 | 46 | 361 | 471 | 232 | 142.0 | 0.96 | Brooklyn Robins |
| 721 | 1921 | BSN | 79 | 721 | 5385 | 1561 | 209 | 100 | 61 | 377 | ... | 12 | 4155 | 1488 | 54 | 420 | 382 | 199 | 122.0 | 0.96 | Boston Braves |
| 722 | 1921 | CHA | 62 | 683 | 5329 | 1509 | 242 | 82 | 35 | 445 | ... | 9 | 4095 | 1603 | 52 | 549 | 392 | 199 | 155.0 | 0.96 | Chicago White Sox |
| 723 | 1921 | CHN | 64 | 668 | 5321 | 1553 | 234 | 56 | 37 | 343 | ... | 7 | 4089 | 1605 | 67 | 409 | 441 | 166 | 129.0 | 0.97 | Chicago Cubs |
5 rows × 26 columns
# Spearman rank-correlation matrix over the numeric columns of the Period 2 subset.
data_baseball_2.corr(numeric_only=True, method="spearman")
| Year | Games_Won | Runs_Scored | At_Bats | Hits | Doubles | Triples | Home_Runs | Walks | Stolen_Bases | ... | Shutout | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | 1.000000 | 0.003614 | -0.286215 | -0.244734 | -0.626035 | -0.537748 | -0.646351 | 0.539521 | 0.374539 | -0.520503 | ... | 0.225234 | 0.523644 | 0.190051 | -0.599826 | 0.664850 | 0.386546 | 0.732192 | -0.701638 | 0.285147 | 0.555904 |
| Games_Won | 0.003614 | 1.000000 | 0.646088 | 0.244756 | 0.406018 | 0.242822 | 0.236794 | 0.337866 | 0.352178 | 0.165264 | ... | 0.519055 | 0.400856 | 0.394018 | -0.464102 | -0.100038 | -0.238954 | 0.344999 | -0.329646 | 0.048373 | 0.312221 |
| Runs_Scored | -0.286215 | 0.646088 | 1.000000 | 0.499992 | 0.784664 | 0.614051 | 0.458482 | 0.357141 | 0.403326 | 0.295935 | ... | -0.001616 | 0.180740 | 0.060217 | 0.151269 | 0.049497 | -0.031496 | 0.042451 | 0.041555 | 0.020241 | -0.073445 |
| At_Bats | -0.244734 | 0.244756 | 0.499992 | 1.000000 | 0.694599 | 0.537271 | 0.305162 | 0.075031 | -0.071665 | 0.103420 | ... | -0.059148 | -0.042488 | 0.406688 | 0.293746 | -0.047121 | -0.085733 | -0.020439 | 0.179172 | -0.003322 | -0.099794 |
| Hits | -0.626035 | 0.406018 | 0.784664 | 0.694599 | 1.000000 | 0.751915 | 0.622882 | -0.029016 | -0.059981 | 0.390425 | ... | -0.127578 | -0.112140 | 0.031054 | 0.438466 | -0.247003 | -0.257385 | -0.271597 | 0.356765 | -0.100321 | -0.293233 |
| Doubles | -0.537748 | 0.242822 | 0.614051 | 0.537271 | 0.751915 | 1.000000 | 0.495230 | -0.108846 | 0.041819 | 0.336566 | ... | -0.174151 | -0.158322 | -0.037458 | 0.447412 | -0.222693 | -0.120245 | -0.258685 | 0.339409 | -0.099956 | -0.280776 |
| Triples | -0.646351 | 0.236794 | 0.458482 | 0.305162 | 0.622882 | 0.495230 | 1.000000 | -0.309150 | -0.086452 | 0.479116 | ... | -0.134950 | -0.184666 | -0.052955 | 0.370779 | -0.414141 | -0.240850 | -0.364113 | 0.444178 | -0.120395 | -0.373057 |
| Home_Runs | 0.539521 | 0.337866 | 0.357141 | 0.075031 | -0.029016 | -0.108846 | -0.309150 | 1.000000 | 0.370917 | -0.335661 | ... | 0.097738 | 0.512814 | 0.095705 | -0.314916 | 0.690838 | 0.232331 | 0.622989 | -0.494533 | 0.192360 | 0.344251 |
| Walks | 0.374539 | 0.352178 | 0.403326 | -0.071665 | -0.059981 | 0.041819 | -0.086452 | 0.370917 | 1.000000 | -0.052033 | ... | 0.139975 | 0.357541 | 0.126279 | -0.269716 | 0.291906 | 0.434837 | 0.367357 | -0.368320 | 0.167018 | 0.270386 |
| Stolen_Bases | -0.520503 | 0.165264 | 0.295935 | 0.103420 | 0.390425 | 0.336566 | 0.479116 | -0.335661 | -0.052033 | 1.000000 | ... | -0.050139 | -0.186946 | 0.044268 | 0.239410 | -0.372661 | -0.183922 | -0.303546 | 0.378207 | -0.096533 | -0.304591 |
| Runs_Against | -0.302324 | -0.615487 | 0.090931 | 0.166083 | 0.236514 | 0.298859 | 0.180093 | -0.114021 | -0.074796 | 0.079060 | ... | -0.740877 | -0.301108 | -0.426160 | 0.830230 | 0.177854 | 0.310423 | -0.399037 | 0.489830 | -0.022776 | -0.474234 |
| Earned_Runs | -0.156537 | -0.602482 | 0.079347 | 0.140458 | 0.164270 | 0.243605 | 0.077195 | -0.004446 | -0.001954 | 0.000526 | ... | -0.717601 | -0.221262 | -0.408754 | 0.752064 | 0.312966 | 0.391139 | -0.281659 | 0.322173 | 0.042096 | -0.333654 |
| Earned_Run_Average | -0.167747 | -0.616380 | 0.068404 | 0.094835 | 0.153582 | 0.235603 | 0.078800 | -0.016359 | -0.016420 | -0.005377 | ... | -0.721786 | -0.234578 | -0.484619 | 0.748579 | 0.301385 | 0.380743 | -0.302801 | 0.330902 | 0.031486 | -0.345943 |
| Complete_Games | -0.577240 | 0.341673 | 0.258781 | 0.159468 | 0.387713 | 0.327732 | 0.420646 | -0.343445 | -0.103390 | 0.388276 | ... | 0.206408 | -0.513033 | 0.074903 | 0.065056 | -0.563590 | -0.399312 | -0.372821 | 0.264284 | -0.175052 | -0.179839 |
| Shutout | 0.225234 | 0.519055 | -0.001616 | -0.059148 | -0.127578 | -0.174151 | -0.134950 | 0.097738 | 0.139975 | -0.050139 | ... | 1.000000 | 0.205206 | 0.357931 | -0.621933 | -0.145910 | -0.200622 | 0.333962 | -0.408837 | 0.053920 | 0.373022 |
| Saves | 0.523644 | 0.400856 | 0.180740 | -0.042488 | -0.112140 | -0.158322 | -0.184666 | 0.512814 | 0.357541 | -0.186946 | ... | 0.205206 | 1.000000 | 0.219986 | -0.438558 | 0.383228 | 0.177389 | 0.586044 | -0.477646 | 0.199759 | 0.381824 |
| Infield_Put_Outs | 0.190051 | 0.394018 | 0.060217 | 0.406688 | 0.031054 | -0.037458 | -0.052955 | 0.095705 | 0.126279 | 0.044268 | ... | 0.357931 | 0.219986 | 1.000000 | -0.300374 | -0.032738 | -0.074637 | 0.335975 | -0.229324 | 0.088070 | 0.272851 |
| Hits_Allowed | -0.599826 | -0.464102 | 0.151269 | 0.293746 | 0.438466 | 0.447412 | 0.370779 | -0.314916 | -0.269716 | 0.239410 | ... | -0.621933 | -0.438558 | -0.300374 | 1.000000 | -0.140169 | -0.091030 | -0.620452 | 0.627175 | -0.106445 | -0.525222 |
| Home_Run_Allowed | 0.664850 | -0.100038 | 0.049497 | -0.047121 | -0.247003 | -0.222693 | -0.414141 | 0.690838 | 0.291906 | -0.372661 | ... | -0.145910 | 0.383228 | -0.032738 | -0.140169 | 1.000000 | 0.360044 | 0.517713 | -0.457692 | 0.226421 | 0.290784 |
| Walks_Allowed | 0.386546 | -0.238954 | -0.031496 | -0.085733 | -0.257385 | -0.120245 | -0.240850 | 0.232331 | 0.434837 | -0.183922 | ... | -0.200622 | 0.177389 | -0.074637 | -0.091030 | 0.360044 | 1.000000 | 0.270360 | -0.188106 | 0.335863 | 0.082026 |
| Strikeouts_Allowed | 0.732192 | 0.344999 | 0.042451 | -0.020439 | -0.271597 | -0.258685 | -0.364113 | 0.622989 | 0.367357 | -0.303546 | ... | 0.333962 | 0.586044 | 0.335975 | -0.620452 | 0.517713 | 0.270360 | 1.000000 | -0.632651 | 0.109443 | 0.493691 |
| Errors | -0.701638 | -0.329646 | 0.041555 | 0.179172 | 0.356765 | 0.339409 | 0.444178 | -0.494533 | -0.368320 | 0.378207 | ... | -0.408837 | -0.477646 | -0.229324 | 0.627175 | -0.457692 | -0.188106 | -0.632651 | 1.000000 | -0.285144 | -0.828624 |
| Double_Plays | 0.285147 | 0.048373 | 0.020241 | -0.003322 | -0.100321 | -0.099956 | -0.120395 | 0.192360 | 0.167018 | -0.096533 | ... | 0.053920 | 0.199759 | 0.088070 | -0.106445 | 0.226421 | 0.335863 | 0.109443 | -0.285144 | 1.000000 | 0.259383 |
| Fielding_Percentage | 0.555904 | 0.312221 | -0.073445 | -0.099794 | -0.293233 | -0.280776 | -0.373057 | 0.344251 | 0.270386 | -0.304591 | ... | 0.373022 | 0.381824 | 0.272851 | -0.525222 | 0.290784 | 0.082026 | 0.493691 | -0.828624 | 0.259383 | 1.000000 |
24 rows × 24 columns
# Keep the Period 2 Spearman matrix, then list every variable's correlation
# with Games_Won from highest to lowest.
df2_corr = data_baseball_2.corr(numeric_only=True, method="spearman")
games_won_ranking_2 = df2_corr.loc[:, 'Games_Won'].sort_values(ascending=False)
print(games_won_ranking_2)
Games_Won 1.000000 Runs_Scored 0.646088 Shutout 0.519055 Hits 0.406018 Saves 0.400856 Infield_Put_Outs 0.394018 Walks 0.352178 Strikeouts_Allowed 0.344999 Complete_Games 0.341673 Home_Runs 0.337866 Fielding_Percentage 0.312221 At_Bats 0.244756 Doubles 0.242822 Triples 0.236794 Stolen_Bases 0.165264 Double_Plays 0.048373 Year 0.003614 Home_Run_Allowed -0.100038 Walks_Allowed -0.238954 Errors -0.329646 Hits_Allowed -0.464102 Earned_Runs -0.602482 Runs_Against -0.615487 Earned_Run_Average -0.616380 Name: Games_Won, dtype: float64
The top 8 variables that have a strong correlation with Games Won are: Earned_Runs (-0.60), Runs_Against (-0.62), Earned_Run_Average (-0.62), Runs_Scored (0.65), Shutout (0.52), Hits (0.41), and Hits_Allowed (-0.46)
Multicollinearity:
After eliminating 3 variables above (Earned_Runs, Earned_Run_Average, Runs_Against) from the list, we then pick Hits (0.41), Saves (0.4), Infield_Put_Outs (0.39), Walks (0.35), Strikeouts_Allowed (0.34)
Top 8 candidates:
However, from the look of the histogram, we can see that Shutout, Saves and Strikeouts_Allowed are right skewed, so we'll remove these 3 from the top 8
The final top candidates:
The style of play for this Period is Defensive
# Top-8 candidates for Period 2, each plotted against Games_Won in its own
# figure.
top8 = [
    'Runs_Scored',
    'Walks',
    'Shutout',
    'Saves',
    'Infield_Put_Outs',
    'Hits_Allowed',
    'Strikeouts_Allowed',
    'Complete_Games',
]
for column in top8:
    # The frame is passed as data= so the call does not depend on the order of
    # seaborn's positional parameters.
    sns.scatterplot(data=data_baseball_2, x=column, y='Games_Won')
    # Shown inside the loop so each variable gets a separate plot.
    plt.show()
All 8 variables have a linear relationship with Games Won
# Histograms (20 bins each) of the eight Period 2 candidate variables.
period_2_candidates = [
    'Runs_Scored',
    'Walks',
    'Shutout',
    'Saves',
    'Infield_Put_Outs',
    'Hits_Allowed',
    'Strikeouts_Allowed',
    'Complete_Games',
]
data_baseball_2[period_2_candidates].hist(figsize=(16, 10), bins=20)
plt.show()
# Period 3: seasons 1960 through 1989, both ends inclusive.
from_1960 = data_baseball['Year'] >= 1960
through_1989 = data_baseball['Year'] <= 1989
df3 = data_baseball[from_1960 & through_1989]
df3.head(5)
| Year | Team | Franchise | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1343 | 1960 | BAL | BAL | 2 | 154 | 89 | 65 | 682 | 5170 | 1307 | ... | 22 | 4125 | 1222 | 117 | 552 | 785 | 107 | 172.0 | 0.98 | Baltimore Orioles |
| 1344 | 1960 | BOS | BOS | 7 | 154 | 65 | 89 | 658 | 5215 | 1359 | ... | 23 | 4083 | 1440 | 127 | 580 | 767 | 140 | 156.0 | 0.97 | Boston Red Sox |
| 1345 | 1960 | CHA | CHW | 3 | 154 | 87 | 67 | 741 | 5191 | 1402 | ... | 26 | 4143 | 1338 | 127 | 533 | 695 | 109 | 175.0 | 0.98 | Chicago White Sox |
| 1346 | 1960 | CHN | CHC | 7 | 156 | 60 | 94 | 634 | 5311 | 1293 | ... | 25 | 4206 | 1393 | 152 | 565 | 805 | 143 | 133.0 | 0.97 | Chicago Cubs |
| 1347 | 1960 | CIN | CIN | 6 | 154 | 67 | 87 | 640 | 5289 | 1324 | ... | 35 | 4170 | 1417 | 134 | 442 | 740 | 125 | 155.0 | 0.97 | Cincinnati Reds |
5 rows × 31 columns
# Columns left out of the Period 3 correlation analysis.
# NOTE(review): 'Franchise ' is spelled with a trailing space; presumably this
# matches the column name as loaded -- verify against the CSV header.
period_3_excluded = [
    'Franchise ',
    'Final_Standing',
    'Games_Played',
    'Games_Lost',
]
data_baseball_3 = df3.drop(columns=period_3_excluded)
data_baseball_3.head()
| Year | Team | Games_Won | Runs_Scored | At_Bats | Hits | Doubles | Triples | Home_Runs | Walks | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1343 | 1960 | BAL | 89 | 682 | 5170 | 1307 | 206 | 33 | 123 | 596 | ... | 22 | 4125 | 1222 | 117 | 552 | 785 | 107 | 172.0 | 0.98 | Baltimore Orioles |
| 1344 | 1960 | BOS | 65 | 658 | 5215 | 1359 | 234 | 32 | 124 | 570 | ... | 23 | 4083 | 1440 | 127 | 580 | 767 | 140 | 156.0 | 0.97 | Boston Red Sox |
| 1345 | 1960 | CHA | 87 | 741 | 5191 | 1402 | 242 | 38 | 112 | 567 | ... | 26 | 4143 | 1338 | 127 | 533 | 695 | 109 | 175.0 | 0.98 | Chicago White Sox |
| 1346 | 1960 | CHN | 60 | 634 | 5311 | 1293 | 213 | 48 | 119 | 531 | ... | 25 | 4206 | 1393 | 152 | 565 | 805 | 143 | 133.0 | 0.97 | Chicago Cubs |
| 1347 | 1960 | CIN | 67 | 640 | 5289 | 1324 | 230 | 40 | 140 | 512 | ... | 35 | 4170 | 1417 | 134 | 442 | 740 | 125 | 155.0 | 0.97 | Cincinnati Reds |
5 rows × 27 columns
# Spearman rank-correlation matrix over the numeric columns of the Period 3 subset.
data_baseball_3.corr(numeric_only=True, method="spearman")
| Year | Games_Won | Runs_Scored | At_Bats | Hits | Doubles | Triples | Home_Runs | Walks | Strike_Outs | ... | Shutout | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | 1.000000 | -0.029954 | 0.154436 | 0.156098 | 0.242286 | 0.452462 | -0.144376 | -0.016947 | 0.033468 | -0.134195 | ... | -0.226960 | 0.287252 | -0.017494 | 0.239870 | -0.036910 | 0.023661 | -0.118609 | -0.238096 | -0.030629 | 0.218578 |
| Games_Won | -0.029954 | 1.000000 | 0.628775 | 0.352010 | 0.493893 | 0.355417 | 0.209896 | 0.412268 | 0.312605 | 0.022689 | ... | 0.458363 | 0.481143 | 0.435602 | -0.229717 | -0.054089 | -0.298456 | 0.252206 | -0.213304 | 0.083860 | 0.233488 |
| Runs_Scored | 0.154436 | 0.628775 | 1.000000 | 0.611436 | 0.810858 | 0.655596 | 0.243196 | 0.687291 | 0.481312 | -0.025570 | ... | -0.083663 | 0.370890 | 0.202812 | 0.335193 | 0.430790 | 0.075815 | 0.065151 | -0.078533 | 0.243089 | 0.114494 |
| At_Bats | 0.156098 | 0.352010 | 0.611436 | 1.000000 | 0.812430 | 0.562374 | 0.285263 | 0.353044 | 0.085523 | 0.031112 | ... | -0.021389 | 0.237613 | 0.597540 | 0.422793 | 0.252931 | 0.103416 | 0.151501 | 0.082299 | 0.236492 | 0.029262 |
| Hits | 0.242286 | 0.493893 | 0.810858 | 0.812430 | 1.000000 | 0.711894 | 0.337892 | 0.386281 | 0.164012 | -0.214934 | ... | -0.047773 | 0.311167 | 0.273947 | 0.427227 | 0.248137 | 0.060071 | -0.007018 | -0.004355 | 0.286569 | 0.068733 |
| Doubles | 0.452462 | 0.355417 | 0.655596 | 0.562374 | 0.711894 | 1.000000 | 0.284817 | 0.336022 | 0.182651 | -0.049827 | ... | -0.094101 | 0.366483 | 0.160835 | 0.387455 | 0.234213 | 0.112794 | 0.030630 | -0.081352 | 0.182083 | 0.124019 |
| Triples | -0.144376 | 0.209896 | 0.243196 | 0.285263 | 0.337892 | 0.284817 | 1.000000 | -0.071509 | -0.029809 | 0.021917 | ... | 0.050174 | 0.090665 | 0.147100 | 0.123128 | -0.007936 | -0.006610 | -0.021619 | 0.128806 | 0.110123 | -0.071509 |
| Home_Runs | -0.016947 | 0.412268 | 0.687291 | 0.353044 | 0.386281 | 0.336022 | -0.071509 | 1.000000 | 0.349417 | 0.263787 | ... | -0.102523 | 0.269982 | 0.081249 | 0.217285 | 0.588692 | 0.088878 | 0.198703 | -0.088802 | 0.136797 | 0.126191 |
| Walks | 0.033468 | 0.312605 | 0.481312 | 0.085523 | 0.164012 | 0.182651 | -0.029809 | 0.349417 | 1.000000 | 0.143155 | ... | -0.049912 | 0.128970 | 0.175078 | 0.204290 | 0.272592 | 0.221483 | 0.117903 | 0.009953 | 0.148122 | 0.051578 |
| Strike_Outs | -0.134195 | 0.022689 | -0.025570 | 0.031112 | -0.214934 | -0.049827 | 0.021917 | 0.263787 | 0.143155 | 1.000000 | ... | 0.061687 | 0.165886 | 0.214791 | -0.041198 | 0.211949 | 0.187549 | 0.466592 | 0.149596 | -0.031628 | -0.038730 |
| Stolen_Bases | 0.530798 | 0.182627 | 0.171012 | 0.121200 | 0.217381 | 0.327068 | 0.152363 | -0.154718 | 0.151028 | -0.003202 | ... | -0.009799 | 0.264422 | 0.212512 | 0.050405 | -0.139751 | 0.042666 | -0.035594 | -0.025078 | -0.057478 | 0.040451 |
| Runs_Against | 0.170104 | -0.393825 | 0.299163 | 0.302378 | 0.307657 | 0.322691 | 0.066245 | 0.287172 | 0.221993 | 0.075442 | ... | -0.595011 | -0.062678 | -0.152622 | 0.831224 | 0.669774 | 0.582940 | -0.110281 | 0.321539 | 0.254893 | -0.197592 |
| Earned_Runs | 0.207108 | -0.358201 | 0.330691 | 0.319761 | 0.334683 | 0.360182 | 0.063310 | 0.312797 | 0.231876 | 0.064893 | ... | -0.598940 | -0.029873 | -0.149418 | 0.831591 | 0.695448 | 0.566545 | -0.110451 | 0.211893 | 0.266836 | -0.112742 |
| Earned_Run_Average | 0.226628 | -0.483987 | 0.232658 | 0.169822 | 0.225245 | 0.264171 | -0.002305 | 0.236369 | 0.134042 | -0.041424 | ... | -0.684679 | -0.114241 | -0.339867 | 0.751748 | 0.635865 | 0.494677 | -0.229263 | 0.130949 | 0.184900 | -0.119452 |
| Complete_Games | -0.542194 | 0.298612 | 0.069532 | 0.013862 | 0.046969 | -0.170531 | 0.101545 | 0.077548 | 0.067852 | -0.114150 | ... | 0.448311 | -0.472783 | 0.111350 | -0.184202 | -0.080215 | -0.162110 | 0.050935 | 0.082085 | 0.065949 | -0.046324 |
| Shutout | -0.226960 | 0.458363 | -0.083663 | -0.021389 | -0.047773 | -0.094101 | 0.050174 | -0.102523 | -0.049912 | 0.061687 | ... | 1.000000 | 0.018652 | 0.303477 | -0.473136 | -0.413671 | -0.311853 | 0.251257 | -0.071790 | -0.032082 | 0.112544 |
| Saves | 0.287252 | 0.481143 | 0.370890 | 0.237613 | 0.311167 | 0.366483 | 0.090665 | 0.269982 | 0.128970 | 0.165886 | ... | 0.018652 | 1.000000 | 0.218254 | -0.016984 | 0.086123 | -0.046638 | 0.208578 | -0.157064 | 0.047909 | 0.167216 |
| Infield_Put_Outs | -0.017494 | 0.435602 | 0.202812 | 0.597540 | 0.273947 | 0.160835 | 0.147100 | 0.081249 | 0.175078 | 0.214791 | ... | 0.303477 | 0.218254 | 1.000000 | 0.013079 | -0.090857 | -0.023655 | 0.352814 | 0.091700 | 0.072292 | 0.061301 |
| Hits_Allowed | 0.239870 | -0.229717 | 0.335193 | 0.422793 | 0.427227 | 0.387455 | 0.123128 | 0.217285 | 0.204290 | -0.041198 | ... | -0.473136 | -0.016984 | 0.013079 | 1.000000 | 0.464405 | 0.275620 | -0.242255 | 0.300674 | 0.344285 | -0.132303 |
| Home_Run_Allowed | -0.036910 | -0.054089 | 0.430790 | 0.252931 | 0.248137 | 0.234213 | -0.007936 | 0.588692 | 0.272592 | 0.211949 | ... | -0.413671 | 0.086123 | -0.090857 | 0.464405 | 1.000000 | 0.332911 | 0.079159 | 0.025220 | 0.170257 | 0.011785 |
| Walks_Allowed | 0.023661 | -0.298456 | 0.075815 | 0.103416 | 0.060071 | 0.112794 | -0.006610 | 0.088878 | 0.221483 | 0.187549 | ... | -0.311853 | -0.046638 | -0.023655 | 0.275620 | 0.332911 | 1.000000 | 0.122814 | 0.228565 | 0.232633 | -0.171252 |
| Strikeouts_Allowed | -0.118609 | 0.252206 | 0.065151 | 0.151501 | -0.007018 | 0.030630 | -0.021619 | 0.198703 | 0.117903 | 0.466592 | ... | 0.251257 | 0.208578 | 0.352814 | -0.242255 | 0.079159 | 0.122814 | 1.000000 | 0.018221 | -0.252862 | 0.005303 |
| Errors | -0.238096 | -0.213304 | -0.078533 | 0.082299 | -0.004355 | -0.081352 | 0.128806 | -0.088802 | 0.009953 | 0.149596 | ... | -0.071790 | -0.157064 | 0.091700 | 0.300674 | 0.025220 | 0.228565 | 0.018221 | 1.000000 | 0.064740 | -0.767833 |
| Double_Plays | -0.030629 | 0.083860 | 0.243089 | 0.236492 | 0.286569 | 0.182083 | 0.110123 | 0.136797 | 0.148122 | -0.031628 | ... | -0.032082 | 0.047909 | 0.072292 | 0.344285 | 0.170257 | 0.232633 | -0.252862 | 0.064740 | 1.000000 | 0.039074 |
| Fielding_Percentage | 0.218578 | 0.233488 | 0.114494 | 0.029262 | 0.068733 | 0.124019 | -0.071509 | 0.126191 | 0.051578 | -0.038730 | ... | 0.112544 | 0.167216 | 0.061301 | -0.132303 | 0.011785 | -0.171252 | 0.005303 | -0.767833 | 0.039074 | 1.000000 |
25 rows × 25 columns
# Spearman rank correlation between every pair of numeric period-3 columns,
# then list each column's correlation with Games_Won from highest to lowest.
df3_corr = data_baseball_3.corr(numeric_only=True, method='spearman')
games_won_corr_3 = df3_corr['Games_Won']
print(games_won_corr_3.sort_values(ascending=False))
Games_Won 1.000000 Runs_Scored 0.628775 Hits 0.493893 Saves 0.481143 Shutout 0.458363 Infield_Put_Outs 0.435602 Home_Runs 0.412268 Doubles 0.355417 At_Bats 0.352010 Walks 0.312605 Complete_Games 0.298612 Strikeouts_Allowed 0.252206 Fielding_Percentage 0.233488 Triples 0.209896 Stolen_Bases 0.182627 Double_Plays 0.083860 Strike_Outs 0.022689 Year -0.029954 Home_Run_Allowed -0.054089 Errors -0.213304 Hits_Allowed -0.229717 Walks_Allowed -0.298456 Earned_Runs -0.358201 Runs_Against -0.393825 Earned_Run_Average -0.483987 Name: Games_Won, dtype: float64
Top 8 variables that have the highest correlation with Games Won are Runs_Scored (0.63), Hits (0.49), Saves (0.48), Earned_Run_Average (-0.48), Shutout (0.46), Infield_Put_Outs (0.44), Home_Runs (0.41), Runs_Against (-0.39). But after checking the correlation between these variables, we eliminate:
We then pick the next 2 variables that have high correlations with the dependent variable which are Doubles (0.36) and At_Bats (0.35). These 2 variables do not have high correlation with any other variables in the candidate list
Top 8 variables that have strong correlation with Games Won are :
However, At_Bats and Infield_Put_Outs do not have a linear relationship with Games Won (according to the scatter plots below), and the histograms of Shutout and Saves do not have a symmetric shape. So we'll remove these 4 variables from the top 8
Top FINAL variables that have strong correlation with Games Won are :
The style of play for this Period is Offensive
# Period-3 candidate predictors; each one is plotted against Games_Won in its
# own figure so the linearity of the relationship can be judged visually.
top_p3 = [
    'Runs_Scored',
    'Home_Runs',
    'Doubles',
    'At_Bats',
    'Saves',
    'Earned_Run_Average',
    'Shutout',
    'Infield_Put_Outs',
]
for feature in top_p3:
    sns.scatterplot(data=data_baseball_3, x=feature, y='Games_Won')
    plt.show()
All variables have linear relationship with Games Won except At_Bats and Infield_Put_Outs
# Histograms (20 bins each) of the period-3 candidate predictors, drawn as a
# grid on a single 16x10 inch figure.
hist_columns_p3 = [
    'Runs_Scored', 'Home_Runs', 'Doubles', 'At_Bats',
    'Saves', 'Earned_Run_Average', 'Shutout', 'Infield_Put_Outs',
]
data_baseball_3[hist_columns_p3].hist(figsize=(16, 10), bins=20)
plt.show()
# Period 4: keep the seasons from 1990 through 2010, both endpoints included.
in_period_4 = data_baseball['Year'].between(1990, 2010)
df4 = data_baseball[in_period_4]
df4.head(5)
| Year | Team | Franchise | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2047 | 1990 | ATL | ATL | 6 | 162 | 65 | 97 | 682 | 5504 | 1376 | ... | 30 | 4287 | 1527 | 128 | 579 | 938 | 158 | 133.0 | 0.97 | Atlanta Braves |
| 2048 | 1990 | BAL | BAL | 5 | 161 | 76 | 85 | 669 | 5410 | 1328 | ... | 43 | 4305 | 1445 | 161 | 537 | 776 | 91 | 151.0 | 0.98 | Baltimore Orioles |
| 2049 | 1990 | BOS | BOS | 1 | 162 | 88 | 74 | 699 | 5516 | 1502 | ... | 44 | 4326 | 1439 | 92 | 519 | 997 | 123 | 154.0 | 0.98 | Boston Red Sox |
| 2050 | 1990 | CAL | ANA | 4 | 162 | 80 | 82 | 690 | 5570 | 1448 | ... | 42 | 4362 | 1482 | 106 | 544 | 944 | 140 | 186.0 | 0.97 | California Angels |
| 2051 | 1990 | CHA | CHW | 2 | 162 | 94 | 68 | 682 | 5402 | 1393 | ... | 68 | 4347 | 1313 | 106 | 548 | 914 | 124 | 169.0 | 0.98 | Chicago White Sox |
5 rows × 31 columns
# Remove the columns that are not used in the period-4 analysis. drop()
# returns a new frame, so df4 itself is left untouched.
# NOTE(review): 'Franchise ' is written with a trailing space; presumably that
# matches the actual column label -- confirm against the loaded data.
dropped_columns_4 = ['Franchise ', 'Final_Standing', 'Games_Played', 'Games_Lost']
data_baseball_4 = df4.drop(columns=dropped_columns_4)
data_baseball_4.head()
| Year | Team | Games_Won | Runs_Scored | At_Bats | Hits | Doubles | Triples | Home_Runs | Walks | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2047 | 1990 | ATL | 65 | 682 | 5504 | 1376 | 263 | 26 | 162 | 473 | ... | 30 | 4287 | 1527 | 128 | 579 | 938 | 158 | 133.0 | 0.97 | Atlanta Braves |
| 2048 | 1990 | BAL | 76 | 669 | 5410 | 1328 | 234 | 22 | 132 | 660 | ... | 43 | 4305 | 1445 | 161 | 537 | 776 | 91 | 151.0 | 0.98 | Baltimore Orioles |
| 2049 | 1990 | BOS | 88 | 699 | 5516 | 1502 | 298 | 31 | 106 | 598 | ... | 44 | 4326 | 1439 | 92 | 519 | 997 | 123 | 154.0 | 0.98 | Boston Red Sox |
| 2050 | 1990 | CAL | 80 | 690 | 5570 | 1448 | 237 | 27 | 147 | 566 | ... | 42 | 4362 | 1482 | 106 | 544 | 944 | 140 | 186.0 | 0.97 | California Angels |
| 2051 | 1990 | CHA | 94 | 682 | 5402 | 1393 | 251 | 44 | 106 | 478 | ... | 68 | 4347 | 1313 | 106 | 548 | 914 | 124 | 169.0 | 0.98 | Chicago White Sox |
5 rows × 27 columns
# Display the Spearman rank-correlation matrix of the numeric period-4 columns.
data_baseball_4.corr(method='spearman',numeric_only=True)
| Year | Games_Won | Runs_Scored | At_Bats | Hits | Doubles | Triples | Home_Runs | Walks | Strike_Outs | ... | Shutout | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | 1.000000 | 0.117956 | 0.223444 | 0.256397 | 0.235970 | 0.480090 | -0.052715 | 0.322269 | 0.058329 | 0.497652 | ... | 0.079600 | 0.002486 | 0.052886 | 0.222745 | 0.387507 | 0.051609 | 0.515073 | -0.307064 | 0.166983 | 0.674834 |
| Games_Won | 0.117956 | 1.000000 | 0.600772 | 0.298512 | 0.461585 | 0.339706 | 0.035645 | 0.426131 | 0.507042 | 0.052432 | ... | 0.507288 | 0.684815 | 0.554787 | -0.238813 | -0.149116 | -0.214491 | 0.453809 | -0.180135 | -0.013667 | 0.312278 |
| Runs_Scored | 0.223444 | 0.600772 | 1.000000 | 0.673056 | 0.803931 | 0.619613 | 0.033000 | 0.764597 | 0.609977 | 0.200393 | ... | 0.007605 | 0.261026 | 0.238576 | 0.362677 | 0.418509 | 0.142702 | 0.277957 | -0.046926 | 0.217805 | 0.223405 |
| At_Bats | 0.256397 | 0.298512 | 0.673056 | 1.000000 | 0.874977 | 0.594066 | 0.094263 | 0.457170 | 0.258118 | 0.162638 | ... | -0.032167 | 0.124826 | 0.449639 | 0.523387 | 0.421662 | 0.237404 | 0.235969 | 0.003482 | 0.298820 | 0.231437 |
| Hits | 0.235970 | 0.461585 | 0.803931 | 0.874977 | 1.000000 | 0.640528 | 0.132141 | 0.483970 | 0.321615 | 0.028714 | ... | 0.017088 | 0.223530 | 0.291060 | 0.452449 | 0.399882 | 0.165526 | 0.246038 | -0.042426 | 0.269213 | 0.246081 |
| Doubles | 0.480090 | 0.339706 | 0.619613 | 0.594066 | 0.640528 | 1.000000 | 0.065931 | 0.464299 | 0.303788 | 0.310375 | ... | 0.061033 | 0.115909 | 0.187431 | 0.342565 | 0.369003 | 0.142777 | 0.380712 | -0.090420 | 0.220698 | 0.377380 |
| Triples | -0.052715 | 0.035645 | 0.033000 | 0.094263 | 0.132141 | 0.065931 | 1.000000 | -0.164444 | -0.088981 | 0.004636 | ... | 0.056934 | 0.067381 | 0.066655 | 0.044319 | 0.004001 | 0.050013 | -0.034742 | 0.053635 | 0.008206 | -0.014156 |
| Home_Runs | 0.322269 | 0.426131 | 0.764597 | 0.457170 | 0.483970 | 0.464299 | -0.164444 | 1.000000 | 0.459200 | 0.398992 | ... | -0.043797 | 0.180270 | 0.148960 | 0.290357 | 0.446190 | 0.175666 | 0.327087 | -0.057283 | 0.171135 | 0.248714 |
| Walks | 0.058329 | 0.507042 | 0.609977 | 0.258118 | 0.321615 | 0.303788 | -0.088981 | 0.459200 | 1.000000 | 0.283824 | ... | 0.101953 | 0.243844 | 0.327108 | 0.122227 | 0.152545 | 0.164598 | 0.282681 | 0.009993 | 0.119820 | 0.087310 |
| Strike_Outs | 0.497652 | 0.052432 | 0.200393 | 0.162638 | 0.028714 | 0.310375 | 0.004636 | 0.398992 | 0.283824 | 1.000000 | ... | 0.011215 | 0.007178 | 0.161116 | 0.273001 | 0.350987 | 0.258911 | 0.434378 | 0.120098 | 0.143034 | 0.190811 |
| Stolen_Bases | -0.235142 | 0.127314 | 0.006765 | -0.050920 | 0.012280 | -0.168362 | 0.209106 | -0.177638 | 0.014833 | -0.079186 | ... | 0.089870 | 0.153227 | 0.156520 | -0.060393 | -0.058808 | -0.011643 | -0.001613 | 0.157228 | -0.105856 | -0.203871 |
| Runs_Against | 0.208688 | -0.380634 | 0.336086 | 0.442486 | 0.371873 | 0.322748 | 0.024142 | 0.334318 | 0.120164 | 0.315951 | ... | -0.471475 | -0.292816 | -0.200913 | 0.873788 | 0.745458 | 0.564842 | -0.080155 | 0.303241 | 0.366978 | -0.074135 |
| Earned_Runs | 0.257822 | -0.351739 | 0.360851 | 0.461868 | 0.398964 | 0.348836 | 0.024992 | 0.356028 | 0.130781 | 0.317202 | ... | -0.462647 | -0.278017 | -0.191966 | 0.869671 | 0.773979 | 0.556827 | -0.062359 | 0.198534 | 0.372593 | 0.014360 |
| Earned_Run_Average | 0.133612 | -0.553397 | 0.195758 | 0.241618 | 0.197874 | 0.170389 | -0.048430 | 0.234893 | -0.024736 | 0.137474 | ... | -0.628254 | -0.456560 | -0.457021 | 0.705535 | 0.667019 | 0.434150 | -0.266576 | 0.089091 | 0.223443 | -0.084416 |
| Complete_Games | -0.599190 | 0.068889 | -0.108591 | -0.176729 | -0.143907 | -0.268171 | 0.026889 | -0.198182 | 0.023349 | -0.351041 | ... | 0.205699 | -0.070588 | 0.049852 | -0.219996 | -0.320861 | -0.142088 | -0.254914 | 0.133735 | -0.057114 | -0.365955 |
| Shutout | 0.079600 | 0.507288 | 0.007605 | -0.032167 | 0.017088 | 0.061033 | 0.056934 | -0.043797 | 0.101953 | 0.011215 | ... | 1.000000 | 0.375344 | 0.378517 | -0.358188 | -0.345941 | -0.213151 | 0.341663 | -0.097060 | -0.008776 | 0.217200 |
| Saves | 0.002486 | 0.684815 | 0.261026 | 0.124826 | 0.223530 | 0.115909 | 0.067381 | 0.180270 | 0.243844 | 0.007178 | ... | 0.375344 | 1.000000 | 0.456024 | -0.165830 | -0.152615 | -0.142756 | 0.271737 | -0.036862 | -0.047768 | 0.149132 |
| Infield_Put_Outs | 0.052886 | 0.554787 | 0.238576 | 0.449639 | 0.291060 | 0.187431 | 0.066655 | 0.148960 | 0.327108 | 0.161116 | ... | 0.378517 | 0.456024 | 1.000000 | -0.032067 | -0.095983 | 0.001279 | 0.387216 | 0.007927 | 0.100881 | 0.194005 |
| Hits_Allowed | 0.222745 | -0.238813 | 0.362677 | 0.523387 | 0.452449 | 0.342565 | 0.044319 | 0.290357 | 0.122227 | 0.273001 | ... | -0.358188 | -0.165830 | -0.032067 | 1.000000 | 0.638502 | 0.332399 | -0.118659 | 0.264807 | 0.463829 | -0.005140 |
| Home_Run_Allowed | 0.387507 | -0.149116 | 0.418509 | 0.421662 | 0.399882 | 0.369003 | 0.004001 | 0.446190 | 0.152545 | 0.350987 | ... | -0.345941 | -0.152615 | -0.095983 | 0.638502 | 1.000000 | 0.319042 | 0.111271 | 0.021363 | 0.234086 | 0.195984 |
| Walks_Allowed | 0.051609 | -0.214491 | 0.142702 | 0.237404 | 0.165526 | 0.142777 | 0.050013 | 0.175666 | 0.164598 | 0.258911 | ... | -0.213151 | -0.142756 | 0.001279 | 0.332399 | 0.319042 | 1.000000 | 0.097742 | 0.261717 | 0.321311 | -0.104418 |
| Strikeouts_Allowed | 0.515073 | 0.453809 | 0.277957 | 0.235969 | 0.246038 | 0.380712 | -0.034742 | 0.327087 | 0.282681 | 0.434378 | ... | 0.341663 | 0.271737 | 0.387216 | -0.118659 | 0.111271 | 0.097742 | 1.000000 | -0.090884 | -0.124430 | 0.372185 |
| Errors | -0.307064 | -0.180135 | -0.046926 | 0.003482 | -0.042426 | -0.090420 | 0.053635 | -0.057283 | 0.009993 | 0.120098 | ... | -0.097060 | -0.036862 | 0.007927 | 0.264807 | 0.021363 | 0.261717 | -0.090884 | 1.000000 | 0.069721 | -0.733841 |
| Double_Plays | 0.166983 | -0.013667 | 0.217805 | 0.298820 | 0.269213 | 0.220698 | 0.008206 | 0.171135 | 0.119820 | 0.143034 | ... | -0.008776 | -0.047768 | 0.100881 | 0.463829 | 0.234086 | 0.321311 | -0.124430 | 0.069721 | 1.000000 | 0.143587 |
| Fielding_Percentage | 0.674834 | 0.312278 | 0.223405 | 0.231437 | 0.246081 | 0.377380 | -0.014156 | 0.248714 | 0.087310 | 0.190811 | ... | 0.217200 | 0.149132 | 0.194005 | -0.005140 | 0.195984 | -0.104418 | 0.372185 | -0.733841 | 0.143587 | 1.000000 |
25 rows × 25 columns
# Spearman rank correlation between every pair of numeric period-4 columns,
# then list each column's correlation with Games_Won from highest to lowest.
df4_corr = data_baseball_4.corr(numeric_only=True, method='spearman')
games_won_corr_4 = df4_corr['Games_Won']
print(games_won_corr_4.sort_values(ascending=False))
Games_Won 1.000000 Saves 0.684815 Runs_Scored 0.600772 Infield_Put_Outs 0.554787 Shutout 0.507288 Walks 0.507042 Hits 0.461585 Strikeouts_Allowed 0.453809 Home_Runs 0.426131 Doubles 0.339706 Fielding_Percentage 0.312278 At_Bats 0.298512 Stolen_Bases 0.127314 Year 0.117956 Complete_Games 0.068889 Strike_Outs 0.052432 Triples 0.035645 Double_Plays -0.013667 Home_Run_Allowed -0.149116 Errors -0.180135 Walks_Allowed -0.214491 Hits_Allowed -0.238813 Earned_Runs -0.351739 Runs_Against -0.380634 Earned_Run_Average -0.553397 Name: Games_Won, dtype: float64
Top 8 variables that have the highest correlation with Games Won are Earned_Run_Average (-0.55), Saves (0.68), Runs_Scored (0.60), Infield_Put_Outs (0.55), Shutout (0.51), Walks (0.51), Hits (0.46), Strikeouts Allowed (0.45).
After checking the normality and linear relationship (qq plot and scatter plot), we remove Infield_Put_Outs as it is not normally distributed and does not have a linear relationship with Games Won. The next candidate could have been Home_Runs but it has a strong correlation with Runs_Scored (0.76), so Doubles would be the last candidate.
The final 8 variables that have the highest correlation after considering their correlation with each other are
All 8 candidate variables are independent of each other. Because there are 4 defensive variables out of 8, the style of play for this period is Balanced
# Period-4 candidate predictors. Each name appears once, so every variable is
# plotted against Games_Won exactly once (the list previously repeated
# 'Strikeouts_Allowed', which rendered the same scatter plot twice).
candidate = [
    'Earned_Run_Average',
    'Shutout',
    'Saves',
    'Infield_Put_Outs',
    'Strikeouts_Allowed',
    'Walks',
    'Runs_Scored',
    'Hits',
    'Doubles',
]
for feature in candidate:
    # One figure per predictor so linearity can be judged plot by plot.
    sns.scatterplot(data=data_baseball_4, x=feature, y='Games_Won')
    plt.show()
Even though Infield_Put_Outs seems to have a moderate relationship with Games Won, when we do a scatterplot to check the linear relationship, these 2 variables do not have a linear relationship visually. We might need to eliminate it from the candidate list to build the model
# Period-4 candidate predictors for the normality check. Each name appears
# once (the list previously repeated 'Strikeouts_Allowed', which produced a
# duplicate Q-Q plot).
candidate = [
    'Earned_Run_Average',
    'Shutout',
    'Saves',
    'Infield_Put_Outs',
    'Strikeouts_Allowed',
    'Walks',
    'Runs_Scored',
    'Hits',
    'Doubles',
]
for feature in candidate:
    # Q-Q plot of the column against a normal distribution; line='s' adds the
    # standardized reference line.
    qqplot(data_baseball_4[feature], line='s')
    plt.title(label=feature)
    # Render each figure as soon as it is titled, matching the other
    # plotting cells.
    plt.show()
All the independent variables have normal distribution (or close to normal distribution) except Infield_Put_Outs
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric period-4 columns.
data_baseball_4.describe()
| Year | Games_Won | Runs_Scored | At_Bats | Hits | Doubles | Triples | Home_Runs | Walks | Strike_Outs | ... | Shutout | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | ... | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 | 608.000000 |
| mean | 2000.259868 | 79.445724 | 747.483553 | 5446.587171 | 1441.445724 | 280.891447 | 30.838816 | 161.365132 | 536.782895 | 1017.419408 | ... | 8.276316 | 40.047697 | 4254.504934 | 1441.442434 | 161.365132 | 536.781250 | 1017.412829 | 108.421053 | 148.613487 | 0.980138 |
| std | 5.994086 | 12.282923 | 93.733980 | 363.314246 | 117.725690 | 35.682677 | 8.829355 | 37.691082 | 78.410122 | 136.028295 | ... | 3.856909 | 7.784962 | 283.179215 | 124.154087 | 30.538358 | 73.369502 | 135.513189 | 18.933518 | 20.410780 | 0.004776 |
| min | 1990.000000 | 43.000000 | 466.000000 | 3856.000000 | 963.000000 | 159.000000 | 11.000000 | 68.000000 | 319.000000 | 568.000000 | ... | 0.000000 | 20.000000 | 2952.000000 | 929.000000 | 76.000000 | 288.000000 | 560.000000 | 57.000000 | 82.000000 | 0.970000 |
| 25% | 1995.000000 | 71.000000 | 686.000000 | 5478.750000 | 1398.000000 | 262.750000 | 24.000000 | 135.000000 | 486.750000 | 928.000000 | ... | 6.000000 | 35.000000 | 4300.000000 | 1385.000000 | 141.000000 | 489.000000 | 935.500000 | 95.000000 | 135.000000 | 0.980000 |
| 50% | 2000.000000 | 79.500000 | 747.000000 | 5533.500000 | 1452.500000 | 284.000000 | 30.000000 | 160.500000 | 533.000000 | 1028.000000 | ... | 8.000000 | 40.000000 | 4328.000000 | 1452.000000 | 162.000000 | 537.500000 | 1021.000000 | 107.000000 | 149.000000 | 0.980000 |
| 75% | 2005.000000 | 88.000000 | 807.000000 | 5601.000000 | 1510.250000 | 304.000000 | 36.250000 | 187.000000 | 589.250000 | 1105.250000 | ... | 11.000000 | 45.000000 | 4358.000000 | 1521.250000 | 182.000000 | 582.000000 | 1106.250000 | 122.000000 | 162.000000 | 0.983000 |
| max | 2010.000000 | 116.000000 | 1009.000000 | 5781.000000 | 1684.000000 | 376.000000 | 61.000000 | 264.000000 | 775.000000 | 1529.000000 | ... | 24.000000 | 68.000000 | 4467.000000 | 1734.000000 | 241.000000 | 784.000000 | 1404.000000 | 173.000000 | 204.000000 | 0.989000 |
8 rows × 25 columns
# Histograms (20 bins each) of the eight period-4 candidate predictors, drawn
# as a grid on a single 16x10 inch figure.
hist_columns_p4 = [
    'Earned_Run_Average', 'Shutout', 'Saves', 'Strikeouts_Allowed',
    'Walks', 'Runs_Scored', 'Hits', 'Doubles',
]
data_baseball_4[hist_columns_p4].hist(figsize=(16, 10), bins=20)
plt.show()
# Most frequent value(s) of each candidate predictor; a column with several
# tied modes fills more than one row.
mode_columns_p4 = [
    'Earned_Run_Average', 'Shutout', 'Saves', 'Walks',
    'Runs_Scored', 'Hits', 'Strikeouts_Allowed', 'Doubles',
]
data_baseball_4[mode_columns_p4].mode()
| Earned_Run_Average | Shutout | Saves | Walks | Runs_Scored | Hits | Strikeouts_Allowed | Doubles | |
|---|---|---|---|---|---|---|---|---|
| 0 | 3.87 | 8.0 | 42.0 | 500 | 747.0 | 1408.0 | 997.0 | 294.0 |
| 1 | 4.13 | NaN | NaN | 511 | 772.0 | NaN | NaN | NaN |
| 2 | 4.21 | NaN | NaN | 519 | NaN | NaN | NaN | NaN |
| 3 | 4.23 | NaN | NaN | 526 | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | 536 | NaN | NaN | NaN | NaN |
# Variance of each candidate predictor, computed column by column (axis=0).
variance_columns_p4 = [
    'Earned_Run_Average', 'Shutout', 'Saves', 'Walks',
    'Runs_Scored', 'Doubles', 'Hits', 'Strikeouts_Allowed',
]
candidate_frame_p4 = data_baseball_4[variance_columns_p4]
candidate_frame_p4.var(axis=0)
Earned_Run_Average 0.299431 Shutout 14.875748 Saves 60.605629 Walks 6148.147186 Runs_Scored 8786.059037 Doubles 1273.253436 Hits 13859.338071 Strikeouts_Allowed 18363.824349 dtype: float64
Comment on the histogram plots:
# Box plot of every period-4 candidate predictor, one figure per variable,
# used to look for outliers.
candidate2 = [
    'Earned_Run_Average',
    'Shutout',
    'Saves',
    'Walks',
    'Runs_Scored',
    'Hits',
    'Strikeouts_Allowed',
    'Doubles',
]
for feature in candidate2:
    # The figure must be opened BEFORE drawing: calling plt.figure() after
    # plt.boxplot() only creates an extra empty figure (reported as
    # "<Figure size 100x100 with 0 Axes>") and never resizes the plot.
    # A 1x1 inch canvas is too small to read, so a compact 4x3 inch figure
    # is used instead.
    plt.figure(figsize=(4, 3))
    plt.boxplot(data_baseball_4[feature])
    plt.title(label=feature)
    plt.show()
<Figure size 100x100 with 0 Axes>
<Figure size 100x100 with 0 Axes>
<Figure size 100x100 with 0 Axes>
<Figure size 100x100 with 0 Axes>
<Figure size 100x100 with 0 Axes>
<Figure size 100x100 with 0 Axes>
<Figure size 100x100 with 0 Axes>
<Figure size 100x100 with 0 Axes>
Comment on the box plots:
Are there many outliers? Yes, all 8 variables have outliers; however, Earned_Run_Average seems to be the variable with the fewest outliers of the 8
| Period | # of candidate variables | # of Offensive Measurement | # of Defensive Measurement | Style of Play |
|---|---|---|---|---|
| Period 1 | 7 | 4 | 3 | Offensive |
| Period 2 | 4 | 1 | 3 | Defensive |
| Period 3 | 4 | 3 | 1 | Offensive |
| Period 4 | 7 | 3 | 4 | Balanced |
# Columns carried into the modelling step. Games_Won is kept inside x as well
# because the formula-based OLS fits read both the response and the predictors
# from x_train.
model_columns = [
    'Year', 'Games_Won', 'Earned_Run_Average', 'Shutout', 'Saves',
    'Strikeouts_Allowed', 'Walks', 'Runs_Scored', 'Hits', 'Doubles',
]
x = data_baseball_4[model_columns]
y = data_baseball_4['Games_Won']

# 70% of the rows go to training; random_state=1 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.7, random_state=1
)

# Spearman correlation matrix of the training rows only.
x_train.corr(method='spearman')
| Year | Games_Won | Earned_Run_Average | Shutout | Saves | Strikeouts_Allowed | Walks | Runs_Scored | Hits | Doubles | |
|---|---|---|---|---|---|---|---|---|---|---|
| Year | 1.000000 | 0.117416 | 0.108228 | 0.076916 | 0.040540 | 0.499671 | 0.029491 | 0.187975 | 0.223441 | 0.457771 |
| Games_Won | 0.117416 | 1.000000 | -0.536063 | 0.506401 | 0.694183 | 0.446054 | 0.501467 | 0.618414 | 0.493718 | 0.344472 |
| Earned_Run_Average | 0.108228 | -0.536063 | 1.000000 | -0.614966 | -0.446337 | -0.305579 | 0.000537 | 0.190898 | 0.177012 | 0.151402 |
| Shutout | 0.076916 | 0.506401 | -0.614966 | 1.000000 | 0.390981 | 0.374718 | 0.051362 | 0.006828 | 0.038098 | 0.063106 |
| Saves | 0.040540 | 0.694183 | -0.446337 | 0.390981 | 1.000000 | 0.282102 | 0.244940 | 0.284601 | 0.230298 | 0.137051 |
| Strikeouts_Allowed | 0.499671 | 0.446054 | -0.305579 | 0.374718 | 0.282102 | 1.000000 | 0.235478 | 0.239279 | 0.232723 | 0.383407 |
| Walks | 0.029491 | 0.501467 | 0.000537 | 0.051362 | 0.244940 | 0.235478 | 1.000000 | 0.627916 | 0.351331 | 0.286358 |
| Runs_Scored | 0.187975 | 0.618414 | 0.190898 | 0.006828 | 0.284601 | 0.239279 | 0.627916 | 1.000000 | 0.804903 | 0.597991 |
| Hits | 0.223441 | 0.493718 | 0.177012 | 0.038098 | 0.230298 | 0.232723 | 0.351331 | 0.804903 | 1.000000 | 0.649448 |
| Doubles | 0.457771 | 0.344472 | 0.151402 | 0.063106 | 0.137051 | 0.383407 | 0.286358 | 0.597991 | 0.649448 | 1.000000 |
# Rank the training-set columns by their Spearman correlation with Games_Won;
# this ordering drives which predictor is tried next in the models.
x_train_corr = x_train.corr(method='spearman')
train_games_won_corr = x_train_corr['Games_Won']
print(train_games_won_corr.sort_values(ascending=False))
Games_Won 1.000000 Saves 0.694183 Runs_Scored 0.618414 Shutout 0.506401 Walks 0.501467 Hits 0.493718 Strikeouts_Allowed 0.446054 Doubles 0.344472 Year 0.117416 Earned_Run_Average -0.536063 Name: Games_Won, dtype: float64
# Model 1: simple OLS regression of Games_Won on Saves, fitted on the
# training split.
lm1_formula = 'Games_Won ~ Saves'
lm1 = smf.ols(formula=lm1_formula, data=x_train).fit()
lm1.summary()
| Dep. Variable: | Games_Won | R-squared: | 0.505 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.504 |
| Method: | Least Squares | F-statistic: | 431.5 |
| Date: | Tue, 19 Dec 2023 | Prob (F-statistic): | 1.41e-66 |
| Time: | 20:47:10 | Log-Likelihood: | -1523.4 |
| No. Observations: | 425 | AIC: | 3051. |
| Df Residuals: | 423 | BIC: | 3059. |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 33.9552 | 2.222 | 15.279 | 0.000 | 29.587 | 38.323 |
| Saves | 1.1349 | 0.055 | 20.773 | 0.000 | 1.028 | 1.242 |
| Omnibus: | 4.195 | Durbin-Watson: | 1.966 |
|---|---|---|---|
| Prob(Omnibus): | 0.123 | Jarque-Bera (JB): | 3.672 |
| Skew: | 0.150 | Prob(JB): | 0.159 |
| Kurtosis: | 2.657 | Cond. No. | 213. |
Conclusion: we will keep Saves in the model and add 1 more independent variable to see if the model is stronger
# Model 2: add Runs_Scored as a second predictor next to Saves.
lm2_formula = 'Games_Won ~ Saves + Runs_Scored'
lm2 = smf.ols(formula=lm2_formula, data=x_train).fit()
lm2.summary()
| Dep. Variable: | Games_Won | R-squared: | 0.714 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.713 |
| Method: | Least Squares | F-statistic: | 527.5 |
| Date: | Tue, 19 Dec 2023 | Prob (F-statistic): | 1.57e-115 |
| Time: | 20:47:11 | Log-Likelihood: | -1406.6 |
| No. Observations: | 425 | AIC: | 2819. |
| Df Residuals: | 422 | BIC: | 2831. |
| Df Model: | 2 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | -3.2963 | 2.710 | -1.216 | 0.225 | -8.623 | 2.031 |
| Saves | 0.9075 | 0.044 | 20.852 | 0.000 | 0.822 | 0.993 |
| Runs_Scored | 0.0617 | 0.004 | 17.584 | 0.000 | 0.055 | 0.069 |
| Omnibus: | 15.765 | Durbin-Watson: | 2.104 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 16.599 |
| Skew: | 0.444 | Prob(JB): | 0.000249 |
| Kurtosis: | 3.384 | Cond. No. | 6.37e+03 |
Conclusion: We will try to suppress the intercept to see if the model is better or worse
# Model 3: same predictors as model 2, but the trailing "- 1" in the formula
# removes the intercept term.
# NOTE: without an intercept the summary reports the *uncentered* R-squared,
# which is not directly comparable with the R-squared of the other models.
lm3_formula = 'Games_Won ~ Saves + Runs_Scored - 1'
lm3 = smf.ols(formula=lm3_formula, data=x_train).fit()
lm3.summary()
| Dep. Variable: | Games_Won | R-squared (uncentered): | 0.993 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared (uncentered): | 0.993 |
| Method: | Least Squares | F-statistic: | 3.072e+04 |
| Date: | Tue, 19 Dec 2023 | Prob (F-statistic): | 0.00 |
| Time: | 20:47:12 | Log-Likelihood: | -1407.3 |
| No. Observations: | 425 | AIC: | 2819. |
| Df Residuals: | 423 | BIC: | 2827. |
| Df Model: | 2 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Saves | 0.8888 | 0.041 | 21.811 | 0.000 | 0.809 | 0.969 |
| Runs_Scored | 0.0584 | 0.002 | 26.654 | 0.000 | 0.054 | 0.063 |
| Omnibus: | 14.798 | Durbin-Watson: | 2.098 |
|---|---|---|---|
| Prob(Omnibus): | 0.001 | Jarque-Bera (JB): | 15.403 |
| Skew: | 0.436 | Prob(JB): | 0.000452 |
| Kurtosis: | 3.333 | Cond. No. | 95.9 |
Conclusion: We will try to add another independent variable to the model and add the intercept back to see if it gets better
# Model 4: intercept restored, Earned_Run_Average added as a third predictor.
lm4_formula = 'Games_Won ~ Saves + Runs_Scored + Earned_Run_Average'
lm4 = smf.ols(formula=lm4_formula, data=x_train).fit()
lm4.summary()
| Dep. Variable: | Games_Won | R-squared: | 0.922 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.921 |
| Method: | Least Squares | F-statistic: | 1650. |
| Date: | Tue, 19 Dec 2023 | Prob (F-statistic): | 2.84e-232 |
| Time: | 20:47:13 | Log-Likelihood: | -1131.8 |
| No. Observations: | 425 | AIC: | 2272. |
| Df Residuals: | 421 | BIC: | 2288. |
| Df Model: | 3 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 52.5169 | 2.195 | 23.923 | 0.000 | 48.202 | 56.832 |
| Saves | 0.4096 | 0.027 | 15.020 | 0.000 | 0.356 | 0.463 |
| Runs_Scored | 0.0877 | 0.002 | 43.881 | 0.000 | 0.084 | 0.092 |
| Earned_Run_Average | -12.6762 | 0.380 | -33.364 | 0.000 | -13.423 | -11.929 |
| Omnibus: | 1.451 | Durbin-Watson: | 1.987 |
|---|---|---|---|
| Prob(Omnibus): | 0.484 | Jarque-Bera (JB): | 1.424 |
| Skew: | 0.141 | Prob(JB): | 0.491 |
| Kurtosis: | 2.975 | Cond. No. | 9.93e+03 |
Conclusion: We will try to add another independent variable to the model to see if it gets better
# Model 5: add Shutout as a fourth predictor.
lm5_formula = 'Games_Won ~ Saves + Runs_Scored + Earned_Run_Average + Shutout'
lm5 = smf.ols(formula=lm5_formula, data=x_train).fit()
lm5.summary()
| Dep. Variable: | Games_Won | R-squared: | 0.927 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.926 |
| Method: | Least Squares | F-statistic: | 1331. |
| Date: | Tue, 19 Dec 2023 | Prob (F-statistic): | 5.83e-237 |
| Time: | 20:49:57 | Log-Likelihood: | -1117.0 |
| No. Observations: | 425 | AIC: | 2244. |
| Df Residuals: | 420 | BIC: | 2264. |
| Df Model: | 4 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 45.8561 | 2.444 | 18.760 | 0.000 | 41.051 | 50.661 |
| Saves | 0.4002 | 0.026 | 15.142 | 0.000 | 0.348 | 0.452 |
| Runs_Scored | 0.0860 | 0.002 | 43.942 | 0.000 | 0.082 | 0.090 |
| Earned_Run_Average | -11.3503 | 0.440 | -25.823 | 0.000 | -12.214 | -10.486 |
| Shutout | 0.3107 | 0.057 | 5.496 | 0.000 | 0.200 | 0.422 |
| Omnibus: | 0.849 | Durbin-Watson: | 1.977 |
|---|---|---|---|
| Prob(Omnibus): | 0.654 | Jarque-Bera (JB): | 0.754 |
| Skew: | 0.102 | Prob(JB): | 0.686 |
| Kurtosis: | 3.030 | Cond. No. | 1.15e+04 |
Comment: All the measurements look good, but we will still try to add another independent variable to the model to see if it gets better
# Model 6: add Walks as a fifth predictor to check whether it improves on
# model 5.
lm6_formula = 'Games_Won ~ Saves + Runs_Scored + Earned_Run_Average + Shutout + Walks'
lm6 = smf.ols(formula=lm6_formula, data=x_train).fit()
lm6.summary()
| Dep. Variable: | Games_Won | R-squared: | 0.927 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.926 |
| Method: | Least Squares | F-statistic: | 1069. |
| Date: | Tue, 19 Dec 2023 | Prob (F-statistic): | 6.77e-236 |
| Time: | 20:47:14 | Log-Likelihood: | -1115.8 |
| No. Observations: | 425 | AIC: | 2244. |
| Df Residuals: | 419 | BIC: | 2268. |
| Df Model: | 5 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 44.9275 | 2.512 | 17.884 | 0.000 | 39.989 | 49.866 |
| Saves | 0.4004 | 0.026 | 15.178 | 0.000 | 0.349 | 0.452 |
| Runs_Scored | 0.0835 | 0.003 | 33.043 | 0.000 | 0.079 | 0.088 |
| Earned_Run_Average | -11.2425 | 0.444 | -25.308 | 0.000 | -12.116 | -10.369 |
| Shutout | 0.3151 | 0.057 | 5.577 | 0.000 | 0.204 | 0.426 |
| Walks | 0.0042 | 0.003 | 1.556 | 0.120 | -0.001 | 0.010 |
| Omnibus: | 1.041 | Durbin-Watson: | 1.987 |
|---|---|---|---|
| Prob(Omnibus): | 0.594 | Jarque-Bera (JB): | 0.987 |
| Skew: | 0.118 | Prob(JB): | 0.611 |
| Kurtosis: | 2.999 | Cond. No. | 1.45e+04 |
Comment: R-squared and Adj. R-squared do not change and the Walks coefficient is not significant (p = 0.120 > 0.05), which means adding Walks does not make the model better. So we will finalize our regression model with 4 variables only: Saves, Runs_Scored, Earned_Run_Average and Shutout
# Final model: refit the four-predictor OLS regression (Saves, Runs_Scored,
# Earned_Run_Average, Shutout) on the training split and display its summary.
lm5 = smf.ols(formula='Games_Won ~ Saves + Runs_Scored + Earned_Run_Average + Shutout',data = x_train).fit()
lm5.summary()
| Dep. Variable: | Games_Won | R-squared: | 0.927 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.926 |
| Method: | Least Squares | F-statistic: | 1331. |
| Date: | Tue, 19 Dec 2023 | Prob (F-statistic): | 5.83e-237 |
| Time: | 20:47:15 | Log-Likelihood: | -1117.0 |
| No. Observations: | 425 | AIC: | 2244. |
| Df Residuals: | 420 | BIC: | 2264. |
| Df Model: | 4 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 45.8561 | 2.444 | 18.760 | 0.000 | 41.051 | 50.661 |
| Saves | 0.4002 | 0.026 | 15.142 | 0.000 | 0.348 | 0.452 |
| Runs_Scored | 0.0860 | 0.002 | 43.942 | 0.000 | 0.082 | 0.090 |
| Earned_Run_Average | -11.3503 | 0.440 | -25.823 | 0.000 | -12.214 | -10.486 |
| Shutout | 0.3107 | 0.057 | 5.496 | 0.000 | 0.200 | 0.422 |
| Omnibus: | 0.849 | Durbin-Watson: | 1.977 |
|---|---|---|---|
| Prob(Omnibus): | 0.654 | Jarque-Bera (JB): | 0.754 |
| Skew: | 0.102 | Prob(JB): | 0.686 |
| Kurtosis: | 3.030 | Cond. No. | 1.15e+04 |
After conducting a train/test split and exploring the best options for the Linear Regression Model, we selected the following variables for the final model: Saves, Runs_Scored, Earned_Run_Average, and Shutout
Evaluation
# Preview the first rows of the test split before evaluating the model on it.
x_test.head()
| Year | Games_Won | Earned_Run_Average | Shutout | Saves | Strikeouts_Allowed | Walks | Runs_Scored | Hits | Doubles | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2112 | 1992 | 92 | 3.43 | 14 | 39 | 793 | 511 | 740 | 1477 | 272 |
| 2284 | 1998 | 88 | 3.77 | 16 | 46 | 1129 | 572 | 706 | 1425 | 289 |
| 2194 | 1995 | 78 | 3.66 | 11 | 37 | 1060 | 468 | 634 | 1303 | 191 |
| 2512 | 2006 | 78 | 4.41 | 13 | 24 | 948 | 556 | 870 | 1576 | 351 |
| 2234 | 1996 | 88 | 3.98 | 11 | 43 | 1050 | 495 | 759 | 1468 | 281 |
# Summary statistics (count, mean, std, min, quartiles, max) of the test split.
x_test.describe()
| Year | Games_Won | Earned_Run_Average | Shutout | Saves | Strikeouts_Allowed | Walks | Runs_Scored | Hits | Doubles | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 183.000000 | 183.000000 | 183.000000 | 183.000000 | 183.000000 | 183.000000 | 183.000000 | 183.000000 | 183.000000 | 183.000000 |
| mean | 2000.202186 | 79.852459 | 4.274153 | 8.715847 | 40.322404 | 1019.934426 | 534.584699 | 739.830601 | 1436.469945 | 277.956284 |
| std | 6.301718 | 12.015790 | 0.551943 | 4.036389 | 7.838331 | 140.752749 | 71.816786 | 87.186227 | 113.773237 | 36.096151 |
| min | 1990.000000 | 49.000000 | 3.140000 | 1.000000 | 20.000000 | 602.000000 | 336.000000 | 500.000000 | 966.000000 | 160.000000 |
| 25% | 1994.000000 | 71.000000 | 3.835000 | 6.000000 | 36.000000 | 931.000000 | 487.500000 | 687.500000 | 1398.000000 | 260.000000 |
| 50% | 2001.000000 | 80.000000 | 4.230000 | 9.000000 | 40.000000 | 1031.000000 | 534.000000 | 740.000000 | 1445.000000 | 281.000000 |
| 75% | 2006.000000 | 89.000000 | 4.585000 | 11.000000 | 45.000000 | 1114.000000 | 578.000000 | 799.000000 | 1499.500000 | 300.500000 |
| max | 2010.000000 | 114.000000 | 6.020000 | 20.000000 | 68.000000 | 1346.000000 | 707.000000 | 993.000000 | 1644.000000 | 358.000000 |
# Predict Games_Won for the test split with the final model (lm5).
# The predictions are kept in their own variable: assigning them to
# `lm5.predict` would shadow the results object's predict() method and
# break every later call to lm5.predict(...).
test_predictions = lm5.predict(x_test)

# Side-by-side table of actual vs. predicted wins, indexed by season.
predict_values = pd.concat(
    [x_test['Year'], x_test['Games_Won'], test_predictions],
    axis=1,
).set_index('Year')
predict_values.columns = ['actual_Games_Won', 'predicted_Games_Won']

# Residual = actual - predicted (positive means the model under-predicted).
predict_values['residual'] = (
    predict_values['actual_Games_Won'] - predict_values['predicted_Games_Won']
)
predict_values.head(20)
| actual_Games_Won | predicted_Games_Won | residual | |
|---|---|---|---|
| Year | |||
| 1992 | 92 | 90.530145 | 1.469855 |
| 1998 | 88 | 87.169064 | 0.830936 |
| 1995 | 78 | 77.069830 | 0.930170 |
| 2006 | 78 | 84.275542 | -6.275542 |
| 1996 | 88 | 86.590368 | 1.409632 |
| 1998 | 74 | 70.343301 | 3.656699 |
| 2002 | 98 | 90.921032 | 7.078968 |
| 2006 | 83 | 79.504364 | 3.495636 |
| 1998 | 98 | 96.105387 | 1.894613 |
| 1993 | 64 | 69.714215 | -5.714215 |
| 2008 | 92 | 92.766568 | -0.766568 |
| 1990 | 65 | 67.023082 | -2.023082 |
| 1999 | 84 | 84.364780 | -0.364780 |
| 1990 | 70 | 73.097472 | -3.097472 |
| 2000 | 85 | 86.651801 | -1.651801 |
| 1994 | 58 | 52.849387 | 5.150613 |
| 1995 | 74 | 67.247127 | 6.752873 |
| 1998 | 76 | 78.137328 | -2.137328 |
| 2008 | 84 | 81.719385 | 2.280615 |
| 1991 | 83 | 86.466305 | -3.466305 |
# Root-mean-square error between actual and predicted Games_Won on the test split.
test_mse = metrics.mean_squared_error(
    predict_values['actual_Games_Won'],
    predict_values['predicted_Games_Won'],
)
rmse = np.sqrt(test_mse)
print('RMSE metric for the residuals: %.4f' % rmse)
RMSE metric for the residuals: 3.4821
# Scatter Index: RMSE expressed as a percentage of the mean actual Games_Won.
mean_actual_wins = predict_values['actual_Games_Won'].mean()
SI = rmse / mean_actual_wins * 100
print('Scatter Index:', SI)
Scatter Index: 4.360714011847737
# Actual vs. predicted Games_Won with a first-order fit and a 95% confidence band.
# All arguments not listed here fall back to seaborn's regplot defaults.
sns.regplot(
    data=predict_values,
    x='actual_Games_Won',
    y='predicted_Games_Won',
    ci=95,
)
<Axes: xlabel='actual_Games_Won', ylabel='predicted_Games_Won'>
As suggested in Section 1, the thresholds for RMSE and SI are ±5 games won and 5% respectively. With an RMSE of 3.48 and an SI of 4.36%, both are within their thresholds, so we can assume that the expected accuracy of the prediction is acceptable
From the scatterplot, we can see that the residuals do not contradict the linear assumption, which means the model is a good predictor of Games Won in Period 4
# Keep only the 2012-2015 seasons (both bounds inclusive).
in_2012_2015 = data_baseball['Year'].between(2012, 2015)
df5 = data_baseball[in_2012_2015]
df5.head(5)
| Year | Team | Franchise | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2685 | 2012 | ARI | ARI | 3 | 162 | 81 | 81 | 734 | 5462 | 1416 | ... | 39 | 4301 | 1432 | 155 | 417 | 1200 | 90 | 146.0 | 0.985 | Arizona Diamondbacks |
| 2686 | 2012 | ATL | ATL | 2 | 162 | 94 | 68 | 700 | 5425 | 1341 | ... | 47 | 4336 | 1310 | 145 | 464 | 1232 | 86 | 147.0 | 0.986 | Atlanta Braves |
| 2687 | 2012 | BAL | BAL | 2 | 162 | 93 | 69 | 712 | 5560 | 1375 | ... | 55 | 4449 | 1433 | 184 | 481 | 1177 | 106 | 151.0 | 0.983 | Baltimore Orioles |
| 2688 | 2012 | BOS | BOS | 5 | 162 | 69 | 93 | 734 | 5604 | 1459 | ... | 35 | 4329 | 1449 | 190 | 529 | 1176 | 101 | 159.0 | 0.983 | Boston Red Sox |
| 2689 | 2012 | CHA | CHW | 2 | 162 | 85 | 77 | 748 | 5518 | 1409 | ... | 37 | 4337 | 1365 | 186 | 503 | 1246 | 70 | 154.0 | 0.988 | Chicago White Sox |
5 rows × 31 columns
# Rows for team code 'NYA' (New York Yankees) within the 2012-2015 subset.
is_yankees = df5['Team'] == 'NYA'
df5_NY = df5.loc[is_yankees]
df5_NY
| Year | Team | Franchise | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2702 | 2012 | NYA | NYY | 1 | 162 | 95 | 67 | 804 | 5524 | 1462 | ... | 51 | 4336 | 1401 | 190 | 431 | 1318 | 75 | 135.0 | 0.987 | New York Yankees |
| 2732 | 2013 | NYA | NYY | 4 | 162 | 85 | 77 | 650 | 5449 | 1321 | ... | 49 | 4342 | 1452 | 171 | 437 | 1233 | 69 | 139.0 | 0.988 | New York Yankees |
| 2762 | 2014 | NYA | NYY | 2 | 162 | 84 | 78 | 633 | 5497 | 1349 | ... | 48 | 4359 | 1392 | 164 | 398 | 1370 | 92 | 107.0 | 0.984 | New York Yankees |
| 2781 | 2015 | NYA | NYY | 2 | 162 | 87 | 75 | 764 | 5567 | 1397 | ... | 48 | 4373 | 1416 | 182 | 474 | 1370 | 93 | 135.0 | 0.985 | New York Yankees |
4 rows × 31 columns
# Rows for team code 'TOR' (Toronto Blue Jays) within the 2012-2015 subset.
is_blue_jays = df5['Team'] == 'TOR'
df5_TO = df5.loc[is_blue_jays]
df5_TO
| Year | Team | Franchise | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2713 | 2012 | TOR | TOR | 4 | 162 | 73 | 89 | 716 | 5487 | 1346 | ... | 29 | 4331 | 1439 | 204 | 574 | 1142 | 101 | 167.0 | 0.984 | Toronto Blue Jays |
| 2743 | 2013 | TOR | TOR | 5 | 162 | 74 | 88 | 712 | 5537 | 1398 | ... | 39 | 4356 | 1451 | 195 | 500 | 1208 | 111 | 145.0 | 0.982 | Toronto Blue Jays |
| 2773 | 2014 | TOR | TOR | 3 | 162 | 83 | 79 | 723 | 5549 | 1435 | ... | 45 | 4329 | 1400 | 151 | 490 | 1199 | 87 | 130.0 | 0.985 | Toronto Blue Jays |
| 2780 | 2015 | TOR | TOR | 1 | 162 | 93 | 69 | 891 | 5509 | 1480 | ... | 34 | 4323 | 1353 | 173 | 397 | 1117 | 88 | 145.0 | 0.985 | Toronto Blue Jays |
4 rows × 31 columns
# Stack the Yankees rows on top of the Blue Jays rows.
team_frames = [df5_NY, df5_TO]
data_baseball_5 = pd.concat(team_frames, axis=0)

# Display with a fresh 0..n-1 index. reset_index returns a new frame, so
# data_baseball_5 itself keeps its original row labels.
data_baseball_5.reset_index(drop=True)
| Year | Team | Franchise | Final_Standing | Games_Played | Games_Won | Games_Lost | Runs_Scored | At_Bats | Hits | ... | Saves | Infield_Put_Outs | Hits_Allowed | Home_Run_Allowed | Walks_Allowed | Strikeouts_Allowed | Errors | Double_Plays | Fielding_Percentage | Team_Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2012 | NYA | NYY | 1 | 162 | 95 | 67 | 804 | 5524 | 1462 | ... | 51 | 4336 | 1401 | 190 | 431 | 1318 | 75 | 135.0 | 0.987 | New York Yankees |
| 1 | 2013 | NYA | NYY | 4 | 162 | 85 | 77 | 650 | 5449 | 1321 | ... | 49 | 4342 | 1452 | 171 | 437 | 1233 | 69 | 139.0 | 0.988 | New York Yankees |
| 2 | 2014 | NYA | NYY | 2 | 162 | 84 | 78 | 633 | 5497 | 1349 | ... | 48 | 4359 | 1392 | 164 | 398 | 1370 | 92 | 107.0 | 0.984 | New York Yankees |
| 3 | 2015 | NYA | NYY | 2 | 162 | 87 | 75 | 764 | 5567 | 1397 | ... | 48 | 4373 | 1416 | 182 | 474 | 1370 | 93 | 135.0 | 0.985 | New York Yankees |
| 4 | 2012 | TOR | TOR | 4 | 162 | 73 | 89 | 716 | 5487 | 1346 | ... | 29 | 4331 | 1439 | 204 | 574 | 1142 | 101 | 167.0 | 0.984 | Toronto Blue Jays |
| 5 | 2013 | TOR | TOR | 5 | 162 | 74 | 88 | 712 | 5537 | 1398 | ... | 39 | 4356 | 1451 | 195 | 500 | 1208 | 111 | 145.0 | 0.982 | Toronto Blue Jays |
| 6 | 2014 | TOR | TOR | 3 | 162 | 83 | 79 | 723 | 5549 | 1435 | ... | 45 | 4329 | 1400 | 151 | 490 | 1199 | 87 | 130.0 | 0.985 | Toronto Blue Jays |
| 7 | 2015 | TOR | TOR | 1 | 162 | 93 | 69 | 891 | 5509 | 1480 | ... | 34 | 4323 | 1353 | 173 | 397 | 1117 | 88 | 145.0 | 0.985 | Toronto Blue Jays |
8 rows × 31 columns
# Apply the final model (lm5) to the 2012-2015 Yankees / Blue Jays seasons.
# The predictions live in their own variable rather than being attached to the
# fitted results object as an ad-hoc attribute (`lm5.predict2`), which keeps
# lm5 free of state that is unrelated to the fit.
predictions_2012_2015 = lm5.predict(data_baseball_5)

# Side-by-side table of actual vs. predicted wins, indexed by season.
predict_values2 = pd.concat(
    [data_baseball_5['Year'], data_baseball_5['Games_Won'], predictions_2012_2015],
    axis=1,
).set_index('Year')
predict_values2.columns = ['actual_Games_Won', 'predicted_Games_Won']

# Residual = actual - predicted (positive means the model under-predicted).
predict_values2['residual'] = (
    predict_values2['actual_Games_Won'] - predict_values2['predicted_Games_Won']
)
predict_values2.head(20)
| actual_Games_Won | predicted_Games_Won | residual | |
|---|---|---|---|
| Year | |||
| 2012 | 95 | 94.629938 | 0.370062 |
| 2013 | 85 | 79.759212 | 5.240788 |
| 2014 | 84 | 80.053377 | 3.946623 |
| 2015 | 87 | 86.279008 | 0.720992 |
| 2012 | 73 | 69.798379 | 3.201621 |
| 2013 | 74 | 77.882529 | -3.882529 |
| 2014 | 83 | 85.620568 | -2.620568 |
| 2015 | 93 | 96.075066 | -3.075066 |
# RMSE of the 2012-2015 Yankees / Blue Jays predictions.
# NOTE: this rebinds the name `rmse` used for the test-split evaluation.
mse_2012_2015 = metrics.mean_squared_error(
    predict_values2['actual_Games_Won'],
    predict_values2['predicted_Games_Won'],
)
rmse = np.sqrt(mse_2012_2015)
print(rmse)
3.2662343220952215
# Actual vs. predicted Games_Won for the 2012-2015 Yankees / Blue Jays seasons,
# with a first-order fit and a 95% confidence band.
# All arguments not listed here fall back to seaborn's regplot defaults.
sns.regplot(
    data=predict_values2,
    x='actual_Games_Won',
    y='predicted_Games_Won',
    ci=95,
)
<Axes: xlabel='actual_Games_Won', ylabel='predicted_Games_Won'>
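As suggested in Section 1, the thresholds for RMSE and SI are ±5 games won and 5% respectively. With an RMSE of about 3.27 (an SI of about 3.9%, given the mean of 84.25 actual games won), both are within their thresholds, so we can assume that the expected accuracy of the prediction is acceptable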
From the scatterplot, we can see that the residuals do not contradict the linear assumption, which means the model is a good predictor of Games Won for the New York Yankees and Toronto Blue Jays from 2012 to 2015
Data Cleaning process: we detected variables that have missing data and used an appropriate method to either remove them from the dataset or use them only for the period in which they have full data. This eliminates the chance that we might remove important variables that could possibly have a strong correlation with Games Won in a certain period
After completing the data analysis process, we were able to determine the style of play for each of the 4 periods from 1871 to 2010. Also, at the 95% level of confidence, we built a model that can forecast future results by making predictions based on previous data (period 1990 to 2010), with a prediction accuracy (RMSE) of a little over ±3 games won
Throughout the project, we have used correlation calculations to determine the relationship between the dependent variable and the independent variables, and to determine the style of play for each period. QQ plots, scatter plots, scatter matrices, box plots, and histograms were also used to check normality, linear relationships, outliers, etc. This meets the analytical objectives that we set in Section 1
Recommendations for next steps